/************************************************************************/
/*                                                                      */
/*    vspline - a set of generic tools for creation and evaluation      */
/*              of uniform b-splines                                    */
/*                                                                      */
/*            Copyright 2015 - 2017 by Kay F. Jahnke                    */
/*                                                                      */
/*    The git repository for this software is at                        */
/*                                                                      */
/*    https://bitbucket.org/kfj/vspline                                 */
/*                                                                      */
/*    Please direct questions, bug reports, and contributions to        */
/*                                                                      */
/*    kfjahnke+vspline@gmail.com                                        */
/*                                                                      */
/*    Permission is hereby granted, free of charge, to any person       */
/*    obtaining a copy of this software and associated documentation    */
/*    files (the "Software"), to deal in the Software without           */
/*    restriction, including without limitation the rights to use,      */
/*    copy, modify, merge, publish, distribute, sublicense, and/or      */
/*    sell copies of the Software, and to permit persons to whom the    */
/*    Software is furnished to do so, subject to the following          */
/*    conditions:                                                       */
/*                                                                      */
/*    The above copyright notice and this permission notice shall be    */
/*    included in all copies or substantial portions of the             */
/*    Software.                                                         */
/*                                                                      */
/*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND    */
/*    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES   */
/*    OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND          */
/*    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT       */
/*    HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,      */
/*    WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING      */
/*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR     */
/*    OTHER DEALINGS IN THE SOFTWARE.                                   */
/*                                                                      */
/************************************************************************/

/*! \file filter.h

    \brief generic implementation of an n-pole forward-backward IIR filter for nD arrays
    
    This code was initially part of vspline's prefilter.h, but I factored it out
    and disentangled it from the remainder of the code, since it's more general and
    not specific to B-splines.
    
    The code in this file provides efficient filtering of nD arrays with an n-pole
    forward-backward recursive filter, accepting a variety of boundary conditions and
    optionally using multithreading and/or vectorization to speed things up.

    The data have to be presented as vigra MultiArrayViews of elementary floating point
    types or their 'aggregates' (TinyVectors, pixels, etc.), the code is dimension-agnostic
    but templated to the array types used, so the dimensionality is not a run time parameter.
    
    Note the code organization is bottom-up, so the highest-level code comes last.
    Most code using filter.h will only call the final routine, filter_nd - and user code,
    working with vspline::bspline, will not directly call code in this file at all.
    
    While the initial purpose for the code in this file was, of course, b-spline prefiltering,
    the generalized version I present here can be used for arbitrary filters. There is probably
    one other filter which is most useful in the context of vspline: passing a single positive
    pole in the range of ] 0 , 1 [ smoothes the signal very efficiently.
*/

// include common.h for the border condition codes

#include <vector>
#include "common.h"

#ifndef VSPLINE_FILTER_H
#define VSPLINE_FILTER_H

namespace vspline {

/// overall_gain - helper computing the compensating gain for a set of poles.
///
/// Running the recursive filter on its own attenuates the signal. This routine
/// returns the factor which, applied to the signal beforehand, cancels that
/// attenuation: the product over all poles z of ( 1 - z ) * ( 1 - 1 / z ).
/// It lives outside the filter's constructor so that calling code can compute
/// the gain separately and pass it in as a parameter.
///
/// \param nbpoles  number of poles to read from 'pole'; values < 1 yield 1.0
/// \param pole     pointer to nbpoles pole values (none of them may be zero,
///                 since the reciprocal of each pole is taken)
/// \return         the accumulated gain as a long double

static long double overall_gain ( const int & nbpoles ,
                                  const long double * const pole )
{
  long double gain = 1.0L ;
  const long double * current = pole ;

  // walk the poles front to back, folding each pole's factor into the product
  // in the same left-to-right order, so rounding is unaffected

  for ( int left = nbpoles ; left > 0 ; --left , ++current )
  {
    const long double z = *current ;
    gain = gain * ( 1.0L - z ) * ( 1.0L - 1.0L / z ) ;
  }

  return gain ;
}
  
/// class filter implements an n-pole forward-backward recursive (IIR) filter for
/// 1D data. For each pole passed in, it performs a forward-backward first order
/// IIR filter, initially on the data passed in via in_iter, subsequently on the
/// result of the application of the previous pole, using these recursions
/// (p is the current pole, x the input for this pole):
///
/// forward (causal) filter, x' being the forward-filtered signal:
///
/// x'[n] = x[n] + p * x'[n-1]
///
/// backward (anticausal) filter, x'' being the result for this pole:
///
/// x''[n] = p * ( x''[n+1] - x'[n] )
///
/// The first values of both recursions, x'[0] and x''[M-1], depend on the
/// boundary conditions and are calculated by the icc_... and iacc_... routines.
///
/// the result will be deposited via out_iter, which may be an iterator over
/// the same data in_iter iterates over, in which case operation is in-place.
/// in_iter can be a const iterator, it's never used for writing data.
///
/// class filter needs three template arguments, one for the type of iterator over the
/// incoming data, one for the type of iterator to the resultant coefficients, and one
/// for the real type used in arithmetic operations. The iterators' types will usually
/// be the same, but formulating the code with two separate types makes it more
/// versatile. The third template argument will usually be the elementary
/// type of the iterator's value_type. When the value_types are vigra aggregates
/// (TinyVectors etc.) vigra's ExpandElementResult mechanism will provide, but at times
/// we may wish to be explicit here, e.g. when iterating over simdized types.

template < typename in_iter ,   // iterator over the knot point values
           typename out_iter ,  // iterator over the coefficient array
           typename real_type > // type for single real value for calculations
class filter
{
  // both iterators must define value_type and have the same value_type

  typedef typename in_iter::value_type value_type ;
  
  static_assert ( std::is_same < typename out_iter::value_type , value_type > :: value ,
                  "prefilter input and output iterator must have the same value_type" ) ;
  
  // both iterators are accessed with operator[], so they should be random access
  // iterators. This is currently not enforced at compile time.
  
  /// typedef the fully qualified type for brevity, to make the typedefs below
  /// a bit more legible

  typedef filter < in_iter , out_iter , real_type > filter_type ;

  // note: the constructor's initializer list follows this declaration order

  const long double* pole ;          ///< poles of the IIR filter; not owned, the caller's
                                     ///< array must stay valid while the filter is in use
  std::vector<int> horizon ;         ///< corresponding horizon values
  const real_type lambda ;           ///< (potentiated) overall gain.  
  const int npoles ;                 ///< Number of filter poles
  const int M ;                      ///< length of the data

  /// the solving routine and initial coefficient finding routines are called via method pointers.
  /// these pointers are typedefed for better legibility:
  
  typedef void       ( filter_type::*p_solve )  ( in_iter  input , out_iter output ) ;
  typedef value_type ( filter_type::*p_icc1 )  ( in_iter  input , int k ) ;
  typedef value_type ( filter_type::*p_icc2 )  ( out_iter input , int k ) ;
  typedef value_type ( filter_type::*p_iacc )  ( out_iter input , int k ) ;

  // these are the method pointers used. They start out as nullptr, so that the
  // pointers which the constructor does not set (the identity cases only set
  // _p_solve) hold a well-defined value instead of an indeterminate one.
  
  p_solve _p_solve = nullptr ; ///< pointer to the solve method
  p_icc1  _p_icc1 = nullptr ;  ///< pointer to calculation of initial causal coefficient
                               ///< taking the input iterator type
  p_icc2  _p_icc2 = nullptr ;  ///< same, taking the output iterator type
  p_iacc  _p_iacc = nullptr ;  ///< pointer to calculation of initial anticausal coefficient
  
public:

  /// solve() takes two iterators, one to the input data and one to the output space.
  /// Both must provide access to M values, M being the length passed to the
  /// constructor. It's safe to use solve() in-place.

  void solve ( in_iter input , out_iter output )
  {
    (this->*_p_solve) ( input , output ) ;
  }
 
  /// for in-place operation we use the same filter routine.
  /// I checked: a handcoded in-place routine using only a single
  /// iterator is not noticeably faster than using one with two separate iterators.
 
  void solve ( out_iter data )
  {
    (this->*_p_solve) ( data , data ) ;
  }
 
// I use adapted versions of P. Thevenaz' code to calculate the initial causal and
// anticausal coefficients for the filter. The code is changed just a little to work
// with an iterator instead of a C vector.

private:

/// The code for mirrored BCs is adapted from P. Thevenaz' code, the other routines are my
/// own doing, with aid from a digest of spline formulae I received from P. Thevenaz and which
/// were helpful to verify the code against a trusted source.
///
/// note how, in the routines to find the initial causal coefficient, there are two different
/// cases: first the 'accelerated loop', which is used when the horizon for the pole is
/// smaller than the data length, so that the theoretically infinite sum of terms has reached
/// sufficient precision after 'horizon' terms, and the 'full loop', which implements the
/// mathematically precise representation of the limit of the infinite sum towards an
/// infinite number of terms, which happens to be calculable due to the fact that the
/// absolute value of all poles is < 1 and
///
///  lim     n                a
///         sum a * q ^ k =  ---
/// n->inf  k=0              1-q
///
/// first are mirror BCs. This is mirroring 'on bounds',
/// f(-x) == f(x) and f(n-1 - x) == f(n-1 + x)
///
/// note how mirror BCs are equivalent to requiring the first derivative to be zero in the
/// linear algebra approach. Obviously with mirrored data this has to be the case; the location
/// where mirroring occurs is always an extremum. So this case covers 'FLAT' BCs as well
///
/// the initial causal coefficient routines are templated by iterator type, because depending
/// on the circumstances, they may be used either on the input or the output iterator.
/// All icc_/iacc_ routines take the iterator 'c' to the data and the index 'k' of the pole
/// to use, and return the initial coefficient.
  
template < class IT >
value_type icc_mirror ( IT c , int k )
{
  value_type z = value_type ( pole[k] ) ;
  value_type zn, z2n, iz;
  value_type Sum ;
  int  n ;

  if (horizon[k] < M) {
    /* accelerated loop */
    zn = z;
    Sum = c[0];
    for (n = 1; n < horizon[k]; n++)
    {
      Sum += zn * c[n];
      zn *= z;
    }
  }
  else {
    /* full loop */
    zn = z;
    iz = value_type(1.0) / z;
    z2n = value_type ( pow(double(pole[k]), double(M - 1)) );
    Sum = c[0] + z2n * c[M - 1];
    z2n *= z2n * iz;
    for (n = 1; n <= M - 2; n++)
    {
      Sum += (zn + z2n) * c[n];
      zn *= z;
      z2n *= iz;
    }
    Sum /= (value_type(1.0) - zn * zn);
  } 
  return Sum ;
}

/// the initial anticausal coefficient routines are always called with the output iterator,
/// so they needn't be templated like the icc routines.
///
/// I still haven't understood the 'magic' which allows to calculate the initial anticausal
/// coefficient from just two results of the causal filter, but I assume it's some exploitation
/// of the symmetry of the data. This code is adapted from P. Thevenaz'.
/// Note that this reads c[M-2], so the data must hold at least two values.

value_type iacc_mirror ( out_iter c , int k )
{
  value_type z = value_type ( pole[k] ) ;

  return( value_type( z / ( z * z - value_type(1.0) ) ) * ( c [ M - 1 ] + z * c [ M - 2 ] ) );
}

/// next are 'antimirrored' BCs. This is the same as 'natural' BCs: the signal is
/// extrapolated via point mirroring at the ends, resulting in point-symmetry at the ends,
/// which is equivalent to the second derivative being zero, the constraint used in
/// the linear algebra approach to calculate 'natural' BCs:
///
/// f(x) - f(0) == f(0) - f(-x); f(x+n-1) - f(n-1) == f(n-1) - f (n-1-x)

template < class IT >
value_type icc_natural ( IT c , int k )
{
  value_type z = value_type ( pole[k] ) ;
  value_type zn, z2n, iz;
  value_type Sum , c02 ;
  int  n ;

  // f(x) - f(0) == f(0) - f(-x)
  // f(-x) == 2 * f(0) - f(x)
  
  if (horizon[k] < M)
  {
    /* accelerated loop */
    c02 = c[0] + c[0] ;
    zn = z;
    Sum = c[0];
    for (n = 1; n < horizon[k]; n++)
    {
      Sum += zn * ( c02 - c[n] ) ;
      zn *= z;
    }
    return Sum ;
  }
  else {
    /* full loop */
    zn = z;
    iz = value_type(1.0) / z;
    z2n = value_type ( pow(double(pole[k]), double(M - 1)) );
    Sum = value_type( ( value_type(1.0) + z ) / ( value_type(1.0) - z ) )
          * ( c[0] - z2n * c[M - 1] );
    z2n *= z2n * iz;                                                   // z2n == z^2M-3
    for (n = 1; n <= M - 2; n++)
    {
      Sum -= (zn - z2n) * c[n];
      zn *= z;
      z2n *= iz;
    }
    return(Sum / (value_type(1.0) - zn * zn));
  } 
}

/// I still haven't understood the 'magic' which allows to calculate the initial anticausal
/// coefficient from just two results of the causal filter, but I assume it's some exploitation
/// of the symmetry of the data. This code is adapted from P. Thevenaz' formula.
/// Note that this reads c[M-2], so the data must hold at least two values.

value_type iacc_natural ( out_iter c , int k )
{
  value_type z = value_type ( pole[k] ) ;

  return - value_type( z / ( ( value_type(1.0) - z ) * ( value_type(1.0) - z ) ) ) * ( c [ M - 1 ] - z * c [ M - 2 ] ) ;
}

/// next are reflective BCs. This is mirroring 'between bounds':
///
/// f ( -1 - x ) == f ( x ) and f ( n + x ) == f ( n-1 - x )
///
/// I took Thevenaz' routine for mirrored data as a template and adapted it.
/// 'reflective' BCs have some nice properties which make them more suited than mirror BCs in
/// some situations:
/// - the artificial discontinuity is 'pushed out' half a unit spacing
/// - the extrapolated data are just as long as the source data
/// - they play well with even splines

template < class IT >
value_type icc_reflect ( IT c , int k )
{
  value_type z = value_type ( pole[k] ) ;
  value_type zn, z2n, iz;
  value_type Sum ;
  int  n ;

  if (horizon[k] < M)
  {
    /* accelerated loop */
    // the loop starts at n == 0 on purpose: with reflection between bounds,
    // the first extrapolated value to the left is c[0] again
    zn = z;
    Sum = c[0];
    for (n = 0; n < horizon[k]; n++)
    {
      Sum += zn * c[n];
      zn *= z;
    }
    return Sum ;
  }
  else
  {
    /* full loop */
    zn = z;
    iz = value_type(1.0) / z;
    z2n = value_type ( pow(double(pole[k]), double(2 * M)) );
    Sum = 0 ;
    for (n = 0; n < M - 1 ; n++)
    {
      Sum += (zn + z2n) * c[n];
      zn *= z;
      z2n *= iz;
    }
    // here n == M - 1: add the last term without advancing zn, which is
    // needed as z^M in the divisor below
    Sum += (zn + z2n) * c[n];
    return c[0] + Sum / (value_type(1.0) - zn * zn) ;
  } 
}

/// I still haven't understood the 'magic' which allows to calculate the initial anticausal
/// coefficient from just one result of the causal filter, but I assume it's some exploitation
/// of the symmetry of the data. I have to thank P. Thevenaz for his formula which let me code:

value_type iacc_reflect ( out_iter c , int k )
{
  value_type z = value_type ( pole[k] ) ;

  return c[M - 1] / ( value_type(1.0) - value_type(1.0) / z ) ;
}

/// next is periodic BCs. so, f(x) = f(x+N)
///
/// Implementing this is more straightforward than implementing the various mirrored types.
/// The mirrored types are, in fact, also periodic, but with a period twice as large, since they
/// repeat only after the first reflection. So especially the code for the full loop is more complex
/// for mirrored types. The down side here is the lack of symmetry to exploit, which made me code
/// a loop for the initial anticausal coefficient as well.

template < class IT >
value_type icc_periodic ( IT c , int k )
{
  value_type z = value_type ( pole[k] ) ;
  value_type zn ;
  value_type Sum ;
  int  n ;

  if (horizon[k] < M)
  {
    /* accelerated loop, walking backwards from the end of the data */
    zn = z ;
    Sum = c[0] ;
    for ( n = M - 1 ; n > ( M - horizon[k] ) ; n-- )
    {
      Sum += zn * c[n];
      zn *= z;
    }
  }
  else
  {
    /* full loop */
    zn = z;
    Sum = c[0];
    for ( n = M - 1 ; n > 0 ; n-- )
    {
      Sum += zn * c[n];
      zn *= z;
    }
    Sum /= ( value_type(1.0) - zn ) ;
  }
  return Sum ;
}

// TODO doublecheck this routine!

value_type iacc_periodic ( out_iter c , int k )
{
  value_type z = value_type ( pole[k] ) ;
  value_type zn ;
  value_type Sum ;

  if (horizon[k] < M)
  {
    /* accelerated loop */
    zn = z ;
    Sum = c[M-1] * z ;
    for ( int n = 0 ; n < horizon[k] ; n++ )
    {
      zn *= z;
      Sum += zn * c[n];
    }
    Sum = -Sum ;
  }
  else
  {
    /* full loop */
    zn = z;
    Sum = c[M-1];
    for ( int n = 0 ; n < M - 1 ; n++ )
    {
      Sum += zn * c[n];
      zn *= z;
    }
    Sum = z * Sum / ( zn - value_type(1.0) );
  }
  return Sum ;
}

/// guess the initial coefficient. This tries to minimize the effect
/// of starting out with a hard discontinuity as it occurs with zero-padding,
/// while at the same time requiring little arithmetic effort
///
/// for the forward filter, we guess an extrapolation of the signal to the left
/// repeating c[0] indefinitely, which is cheap to compute:

template < class IT >
value_type icc_guess ( IT c , int k )
{
  return c[0] * value_type ( 1.0 / ( 1.0 - pole[k] ) ) ;
}

// for the backward filter, we assume mirror BC, which is also cheap to compute:

value_type iacc_guess ( out_iter c , int k )
{
  return iacc_mirror ( c , k ) ;
}

/// the 'identity' initial coefficients simply pass on the first and last value,
/// respectively. The constructor uses them for ZEROPAD boundary conditions.
/// The pole index k is not needed here, it's only part of the common signature.

template < class IT >
value_type icc_identity ( IT c , int k )
{
  return c[0] ;
}

value_type iacc_identity ( out_iter c , int k )
{
  return c[M-1] ;
}

/// solve_remaining_poles runs the forward-backward recursion in-place on x for
/// all poles but the first one (k = 1 ... npoles - 1). No gain is applied here.
/// Both solve_... variants below finish with this step once the first pole has
/// consumed the input; keeping it in one place avoids duplicating the code.
/// Like the first-pole code, it uses a 'carry' element X holding the previous
/// result of the recursion instead of re-reading it from memory.

void solve_remaining_poles ( out_iter x )
{
  value_type X ;

  for ( int k = 1 ; k < npoles ; k++ )
  {
    const real_type p = real_type ( pole[k] ) ;

    /* causal initialization */
    x[0] = X = (this->*_p_icc2)(x, k);
    
    /* causal recursion */
    for (int n = 1; n < M; n++)
    {
      x[n] = X = x[n] + value_type ( p ) * X ;
    }
    
    /* anticausal initialization */
    x[M - 1] = X = (this->*_p_iacc)(x, k);
    
    /* anticausal recursion */
    for (int n = M - 2; 0 <= n; n--)
    {
      x[n] = X = value_type ( p ) * ( X - x[n] );
    }
  }
}

/// now we come to the solving, or prefiltering code itself.
/// there are two variants, one applying the gain while consuming the input and one
/// applying no gain; the handling of the first pole is coded separately for both
/// for maximum efficiency.
/// The code itself is adapted from P. Thevenaz' code.
///
/// This variant uses a 'carry' element, 'X', to carry the result of the recursion
/// from one iteration to the next instead of using the direct implementation of the
/// recursion formula, which would read the previous value of the recursion from memory
/// by accessing x[n-1], or, x[n+1], respectively.
///
/// c is the iterator to the input, x the iterator to the output, which receives
/// M values. The data must hold at least two values.

void solve_gain_inlined ( in_iter c , out_iter x )
{
  assert ( M > 1 ) ;
  
  // use a buffer of one value_type for the recursion (see below)

  value_type X ;
  const real_type p = real_type ( pole[0] ) ;
  
  // process first pole, applying overall gain in the process
  // of consuming the input. This gain may be a power of the 'orthodox'
  // lambda from Thevenaz' code. This is done when the input is multidimensional,
  // in which case it's wasteful to apply lambda in each dimension. In this situation
  // it makes more sense to apply pow(lambda,dimensions) when solving along the
  // first axis and apply no gain when solving along the other axes.
  // Also note that the application of the gain is performed during the processing
  // of the first (maybe the only) pole of the filter, instead of running a separate
  // loop over the input to apply it before processing starts.
  
  // note how the gain is applied to the initial causal coefficient. This is
  // equivalent to first applying the gain to the input and then calculating
  // the initial causal coefficient from the amplified input.
  
  // note the seemingly strange = X clause in the assignment. By performing this
  // assignment, we buffer the result of the current filter step to be used in the
  // next iteration instead of fetching it again from memory. In my trials, this
  // performed better, especially on SIMD data.
  
  x[0] = X = value_type ( lambda ) * (this->*_p_icc1) (c, 0);

  /* causal recursion */
  // the gain is applied to each input value as it is consumed
  
  for (int n = 1; n < M; n++)
  {
    x[n] = X = value_type ( lambda ) * c[n] + value_type ( p ) * X ;
  }
  
  // now the input is used up and won't be looked at any more; all subsequent
  // processing operates on the output.
  
  /* anticausal initialization */
  
  x[M - 1] = X = (this->*_p_iacc)(x, 0);

  /* anticausal recursion */
  for (int n = M - 2; 0 <= n; n--)
  {
    x[n] = X = value_type ( p ) * ( X - x[n]);
  }
  
  // for the remaining poles, if any, don't apply the gain
  // and process the result from applying the first pole
  
  solve_remaining_poles ( x ) ;
}

/// solve routine without application of any gain, it is assumed that this has been
/// done already during an initial run with the routine above, or in some other way.
/// c is the iterator to the input, x the iterator to the output, which receives
/// M values. The data must hold at least two values.

void solve_no_gain ( in_iter c , out_iter x )
{
  assert ( M > 1 ) ;

  value_type X ;
  const real_type p = real_type ( pole[0] ) ;
  
  // process first pole, consuming the input
  
  /* causal initialization */
  x[0] = X = (this->*_p_icc1)(c, 0);
  
  /* causal recursion */
  for ( int n = 1; n < M; n++)
  {
    x[n] = X = c[n] + value_type ( p ) * X ;
  }
  
  /* anticausal initialization */
  x[M - 1] = X = (this->*_p_iacc)(x, 0);
  
  /* anticausal recursion */
  for ( int n = M - 2; 0 <= n; n--)
  {
    x[n] = X = value_type ( p ) * ( X - x[n]);
  }
  
  // for the remaining poles, if any, work on the result
  // of processing the first pole
  
  solve_remaining_poles ( x ) ;
}

/// shortcircuit routine, copies input to output
///
/// this routine can also be used for splines of degree 0 and 1, for simplicity's sake

void solve_identity ( in_iter c , out_iter x )
{
  if ( &(*x) == &(*c) ) // if operation is in-place we needn't do anything
    return ;
  for ( int n = 0 ; n < M ; n++ ) // otherwise, copy input to output
    x[n] = c[n] ;
}

/// The last bit of work left in class filter is the constructor.
/// The number of input/output values is passed into the constructor, limiting the
/// filter to operate on data precisely of this length.
///
/// 'gain' is the factor applied to the signal while the first pole is processed.
/// It need not be the 'orthodox' gain for the given poles: when filtering nD data,
/// calling code may pass a power of it for one axis and 1.0 for the other axes,
/// saving multiplications (see the comments in solve_gain_inlined). If gain is
/// exactly 1.0, the solve variant which applies no gain at all is used.
///
/// The number of poles and a pointer to the poles themselves are passed in with the
/// parameters _npoles and _pole, respectively. The poles are not copied, so the
/// array _pole points to has to stay valid as long as the filter is used. With
/// fewer than one pole, the filter merely copies input to output.
///
/// Finally, the last parameter, tolerance, gives a measure of the acceptable error.
/// It is used to calculate the 'horizon' for each pole. Passing 0 sets the horizon
/// to the data length, so that the initial coefficients are always calculated
/// with the 'full loop'.
///
/// If bc is not one of the boundary conditions handled below, a message is written
/// to std::cout and a not_supported exception is thrown.

public:
  
filter ( int _M ,               ///< number of input/output elements (DataLength)
         double gain ,          ///< gain to apply to the signal to cancel attenuation
         bc_code bc ,           ///< boundary conditions for this filter
         int _npoles ,          ///< number of poles
         const long double * _pole , ///< pointer to _npoles long doubles holding the filter poles
         double tolerance )     ///< acceptable loss of precision, absolute value
: pole ( _pole ) ,              // initializers are listed in member declaration order,
  lambda ( gain ) ,             // which is the order in which they are executed
  npoles ( _npoles ) ,
  M ( _M )
{
  if ( npoles < 1 )
  {
    // zero poles means there's nothing to do but possibly
    // copying the input to the output, which solve_identity
    // will do if the operation isn't in-place
    _p_solve = & filter_type::solve_identity ;
    return ;
  }
  
  // calculate the horizon for each pole, this is the number of iterations
  // the filter must perform on a unit impulse (TODO doublecheck) for it to
  // decay below 'tolerance'

  horizon.reserve ( npoles ) ;

  for ( int i = 0 ; i < npoles ; i++ )
  {
    if ( tolerance )
      horizon.push_back ( ceil ( log ( tolerance ) / log ( fabs ( pole[i] ) ) ) ) ;
    else
      horizon.push_back ( M ) ;
  }

  if ( gain == 1.0 )
  {
    // gain == 1.0 has no effect, we can use this solve variant, applying no gain:
    _p_solve = & filter_type::solve_no_gain ;
  }
  else
  {
    // if gain isn't 1.0, we use the solve variant which applies it
    // to the signal as it goes along.
    _p_solve = & filter_type::solve_gain_inlined ;
  }

  // while the forward/backward IIR filter in the solve_... routines is the same for all
  // boundary conditions, the calculation of the initial causal and anticausal coefficients
  // depends on the boundary conditions and is handled by a call through a method pointer
  // in the solve_... routines. Here we fix these method pointers:
  
  if ( bc == MIRROR )
  {     
    _p_icc1 = & filter_type::icc_mirror<in_iter> ;
    _p_icc2 = & filter_type::icc_mirror<out_iter> ;
    _p_iacc = & filter_type::iacc_mirror ;
  }
  else if ( bc == NATURAL )
  {     
    _p_icc1 = & filter_type::icc_natural<in_iter> ;
    _p_icc2 = & filter_type::icc_natural<out_iter> ;
    _p_iacc = & filter_type::iacc_natural ;
  }
  else if ( bc == PERIODIC )
  {
    _p_icc1 = & filter_type::icc_periodic<in_iter> ;
    _p_icc2 = & filter_type::icc_periodic<out_iter> ;
    _p_iacc = & filter_type::iacc_periodic ;
  }
  else if ( bc == REFLECT )
  {
    _p_icc1 = & filter_type::icc_reflect<in_iter> ;
    _p_icc2 = & filter_type::icc_reflect<out_iter> ;
    _p_iacc = & filter_type::iacc_reflect ;
  }
  else if ( bc == ZEROPAD )
  {
    _p_icc1 = & filter_type::icc_identity<in_iter> ;
    _p_icc2 = & filter_type::icc_identity<out_iter> ;
    _p_iacc = & filter_type::iacc_identity ;
  }
  else if ( bc == IDENTITY )
  {
    // overrides the solve variant chosen above: the data are merely copied
    _p_solve = & filter_type::solve_identity ;
  }
  else if ( bc == GUESS )
  {
    _p_icc1 = & filter_type::icc_guess<in_iter> ;
    _p_icc2 = & filter_type::icc_guess<out_iter> ;
    _p_iacc = & filter_type::iacc_guess ;
  }
  else
  {
    std::cout << "boundary condition " << bc << " not supported by vspline::filter" << std::endl ;
    throw not_supported ( "boundary condition not supported by vspline::filter" ) ;
  }
}

} ; // end of class filter

// Now that we have generic code for 1D filtering, we want to apply this code to
// n-dimensional arrays. We use the following strategy:
// - perform the prefiltering collinear to each axis separately
// - when processing a specific axis, split the array(s) into chunks and use one job per chunk
// - perform a traversal on each chunk, copying out subsets collinear to the processing axis
//   to a buffer
// - perform the filter on the buffer
// - copy the filtered data to the target

// The code is organized bottom-up, with the highest-level routines furthest down, saving
// on forward declarations. The section of code immediately following doesn't use vectorization,
// the vector code follows.

/// 'monadic' gather, the counterpart of scatter below. gather reads 'count' values of
/// source_type, the first one at 'source' and each following one 'stride' elements
/// further on, and stores them contiguously, starting at 'target'. source_type and
/// target_type may differ; every value is converted with an explicit cast to
/// target_type as it is stored.
///
/// index_type is a template argument, so types wider than int can be used for
/// stride and count, and the routine can also operate on very large areas of memory.

template < typename source_type ,
           typename target_type = source_type ,
           typename index_type = int >
void gather ( const source_type* source ,
              target_type* target ,
              const index_type & stride ,
              index_type count
            )
{
  // count is a by-value copy and serves as the countdown
  for ( ; count ; --count )
  {
    *target++ = target_type ( *source ) ;
    source += stride ;
  }
}

/// 'monadic' scatter: reads 'count' contiguous values of source_type starting at
/// 'source' and stores them 'stride' elements apart, the first one at 'target'.
/// Every value is converted with an explicit cast to target_type as it is stored.
/// index_type is the type used for stride and count.

template < typename source_type ,
           typename target_type = source_type ,
           typename index_type = int >
void scatter ( const source_type* source ,
               target_type* target ,
               const index_type & stride ,
               index_type count
             )
{
  // count is a by-value copy and serves as the countdown
  for ( ; count ; --count )
  {
    *target = target_type ( *source++ ) ;
    target += stride ;
  }
}

/// nonaggregating_filter successively copies all 1D subarrays of source collinear to axis
/// into a 1D buffer, performs the filter 'solver' on the buffer, then writes the filtered
/// data to the corresponding 1D subarray of target (which may be the same as source).
/// While the buffering consumes some time, it saves time on the actual filter calculation,
/// especially with higher-order filters. On my system, I found that buffering already broke
/// even with only one pole, so there is no special treatment here for low-order filtering
/// (TODO confirm).
/// note the use of range_type<T>, which is from multithread.h
/// we derive the index type for the call to the monadic gather/scatter routines
/// automatically, so here it comes out as vigra's difference_type_1

template < class source_view_type ,
           class target_view_type ,
           class math_type >
void nonaggregating_filter ( vspline::range_type
                              < typename source_view_type::difference_type > range ,
                             source_view_type * p_original_source ,
                             target_view_type * p_original_target ,
                             int axis ,
                             double gain ,
                             bc_code bc ,
                             int nbpoles ,
                             const long double * pole ,
                             double tolerance
                           )
{
  typedef typename source_view_type::value_type source_type ;
  typedef typename target_view_type::value_type target_type ;

  // we're in the single-threaded code now. multithread() has simply forwarded
  // the source and target MultiArrayViews and a range, here we use the range
  // to pick out the subarrays of original_source and original_target which we
  // are meant to process in this thread:

  const auto source = p_original_source->subarray ( range[0] , range[1] ) ;
  auto target = p_original_target->subarray ( range[0] , range[1] ) ;
  
  auto count = source.shape ( axis ) ; 

  /// we use a buffer of count value_types

  vigra::MultiArray < 1 , math_type > buffer ( count ) ;

  // avoiding being specific about the iterator's type allows us to slot in
  // any old iterator we can get by calling begin() on buffer 
  
  typedef decltype ( buffer.begin() ) iter_type ;
  typedef filter < iter_type , iter_type , math_type > filter_type ;
  filter_type solver ( count , gain , bc , nbpoles , pole , tolerance ) ;

  // next slice is this far away:

  auto source_stride = source.stride ( axis ) ;

  auto source_base_adress = source.data() ;
  auto buffer_base_adress = buffer.data() ;
  auto target_base_adress = target.data() ;

  if ( source.stride() == target.stride() )
  {
    // we already know that both arrays have the same shape. If the strides are also the same,
    // both arrays have the same structure in memory.
    // If both arrays have the same structure, we can save ourselves the index calculations
    // for the second array, since the indices would come out the same. target_base_adress
    // may be the same as source_base_adress, in which case the operation is in-place, but
    // we can't derive any performance benefit from the fact.
    
    // TODO: doublecheck if there really is a performance benefit from the 'shared'
    // indexes. using the else case below for all situations would simplify the code.

    // pick the first slice of source along the processing axis

    auto source_slice = source.bindAt ( axis , 0 ) ;

    // we permute the slice's strides to ascending order to make the memory access
    // as efficient as possible.

    auto permuted_slice = source_slice.permuteStridesAscending() ;
    
    // we iterate over the elements in this slice - not to access them, but to
    // calculate their offset from the first one. This may not be the most efficient
    // way but it's simple and foolproof and will only be needed once per count values.

    auto source_sliter = permuted_slice.begin() ;
    auto source_sliter_end = permuted_slice.end() ;

    while ( source_sliter < source_sliter_end )
    {
      // copy from the array to the buffer with a monadic gather, casting to
      // math_type in the process
      
      auto source_index = &(*source_sliter) - source_base_adress ;
      
      gather < source_type , math_type > ( source_base_adress + source_index ,
                                           buffer_base_adress ,
                                           source_stride ,
                                           count ) ;
                              
      // finally (puh): apply the prefilter, using the solver in-place, iterating over
      // the vectors in buffer with maximum efficiency.
                              
      solver.solve ( buffer.begin() ) ;
      
      // and perform a monadic scatter to write the filtered data to the destination,
      // casting to target_type in the process

      scatter< math_type , target_type > ( buffer_base_adress ,
                                           target_base_adress + source_index ,
                                           source_stride ,
                                           count ) ;
      ++source_sliter ;
    }
  }
  else
  {
    // pretty much the same as the previouse operation, with the distinction that
    // copying the filtered data from the buffer to the target now needs it's own
    // index etc., since all these may be different.

    // TODO we might permute source_slice's strides to ascending and apply the same
    // permutation to target_slice.
    
    auto source_slice = source.bindAt ( axis , 0 ) ;
    auto source_sliter = source_slice.begin() ;
    auto source_sliter_end = source_slice.end() ;

    auto target_slice = target.bindAt ( axis , 0 ) ;
    auto target_stride = target.stride ( axis ) ;
    auto target_sliter = target_slice.begin() ;

    while ( source_sliter < source_sliter_end )
    {
      auto source_index = &(*source_sliter) - source_base_adress ;
      auto target_index = &(*target_sliter) - target_base_adress ;
      
      gather < source_type , math_type > ( source_base_adress + source_index ,
                                           buffer_base_adress ,
                                           source_stride ,
                                           count ) ;
                                           
      solver.solve ( buffer.begin() ) ;
      
      scatter< math_type , target_type > ( buffer_base_adress ,
                                           target_base_adress + target_index ,
                                           target_stride ,
                                           count ) ;
      ++source_sliter ;
      ++target_sliter ;
    }
  }
}

// The use of Vc has to be switched on by defining the flag USE_VC.
// Before we can code the vectorized counterpart of nonaggregating_filter, we need
// some more infrastructure code:

#ifdef USE_VC

/// extended gather and scatter routines taking 'extrusion parameters'
/// which handle how many times and with which stride the gather/scatter
/// operation is repeated. With these routines, strided memory can be
/// copied to a compact chunk of properly aligned memory and back.
/// The gather routine gathers from source, which points to strided memory,
/// and deposits in target, which is compact.
/// The scatter routine scatters from source, which points to compact memory,
/// and deposits in target, which points to strided memory.
/// Initially I coded using load/store operations to access the 'non-compact'
/// memory as well, if the indexes were contiguous, but surprisingly, this was
/// slower. I like the concise expression with this code - instead of having
/// variants for load/store vs. gather/scatter and masked/unmasked operation,
/// the modus operandi is determined by the indices and mask passed, which is
/// relatively cheap as it occurs only once, while the inner loop can just
/// rip away.
/// per default, the type used for gather/scatter indices (gs_indexes_type)
/// will be what Vc deems appropriate. This comes out as an SIMD type composed
/// of int, and ought to result in the fastest code on the machine level.
/// But since the only *requirement* on gather/scatter indices is that they
/// offer a subscript operator (and hold enough indices), other types can be
/// used as gs_indexes_type as well. Below I make the distinction and pass in
/// a TinyVector of ptrdiff_t if int isn't sufficiently large to hold the
/// intended indices. On my system, this is actually faster.

template < typename source_type ,     // (singular) source type
           typename target_type ,     // (simdized) target type
           typename index_type ,      // (singular) index type for stride, count
           typename gs_indexes_type > // type for gather/scatter indices
void
gather ( const source_type * source ,
         target_type * target ,
         const gs_indexes_type & indexes ,
         const typename target_type::Mask & mask ,
         const index_type & stride ,
         index_type count
       )
{
  // fix the type into which to gather source data
  enum { vsize = target_type::Size } ;
  typedef typename Vc::SimdArray < source_type , vsize > simdized_source_type ;

  // if the mask is all-true, load the data with an unmasked gather operation
  if ( mask.isFull() )
  {
    while ( count-- )
    {
// while Vc hadn't yet implemented gathering using intrinsics (for AVX2)
// I played with using tem directly to see if I could get better performance.
// So far it looks like as if the prefiltering code doesn't benefit.
//       __m256i ix = _mm256_loadu_si256 ( (const __m256i *)&(indexes) ) ;
//       __m256 fv = _mm256_i32gather_ps (source, ix, 4) ;
      simdized_source_type x ( source , indexes ) ;
      * target = target_type ( x ) ;
      source += stride ;
      ++ target ;
    }
  }
  else
  {
    // if there is a partially filled mask, perform a masked gather operation
    while ( count-- )
    {
      simdized_source_type x ( source , indexes , mask ) ;
      * target = target_type ( x ) ;
      source += stride ;
      ++ target ;
    }
  }
}

template < typename source_type ,     // (simdized) source type
           typename target_type ,     // (singular) target type
           typename index_type ,      // (singular) index type for stride, count
           typename gs_indexes_type > // type for gather/scatter indices
void
scatter ( const source_type * source ,
          target_type * target ,
          const gs_indexes_type & indexes ,
          const typename source_type::Mask & mask ,
          const index_type & stride ,
          index_type count
        )
{
  // fix the type from which to scatter target data
  enum { vsize = source_type::Size } ;
  typedef typename Vc::SimdArray < target_type , vsize > simdized_target_type ;

  // if the mask is full, deposit with an unmasked scatter
  if ( mask.isFull() )
  {
    while ( count-- )
    {
      simdized_target_type x ( *source ) ;
      x.scatter ( target , indexes ) ;
      ++ source ;
      target += stride ;
    }
  }
  else
  {
    // if there is a partially filled mask, perform a masked scatter operation
    while ( count-- )
    {
      simdized_target_type x ( *source ) ;
      x.scatter ( target , indexes , mask ) ;
      ++ source ;
      target += stride ;
    }
  }
}

/// aggregating_filter keeps a buffer of vector-aligned memory, which it fills from
/// vsize 1D subarrays of the source array which are collinear to the processing axis.
/// Note that the vectorization, or aggregation axis is *orthogonal* to the processing
/// axis, since the adjacency of neighbours along the processing axis needs to be
/// preserved for filtering.
/// The buffer is then submitted to vectorized forward-backward recursive filtering
/// and finally stored back to the corresponding memory area in target, which may
/// be the same as source, in which case the operation is seemingly performed
/// in-place (while in fact the buffer is still used). Buffering takes the bulk
/// of the processing time (on my system), the vectorized maths are fast by
/// comparison. Depending on data type, array size and spline degree, sometimes the
/// nonvectorized code is faster. But as both grow, buffering soon comes out on top.
/// ele_aggregating_filter is a subroutine processing arrays of elementary value_type.
/// It's used by aggregating_filter, after element-expanding the array(s).
/// With this vectorized routine and the size of gather/scatter indices used by Vc
/// numeric overflow could occur: the index type is only int, while it's assigned a
/// ptrdiff_t, which it may not be able to represent. The overflow can happen when
/// a gather/scatter spans a too-large memory area. The gather/scatter indices will
/// be set up so that the first index is always 0 (by using the address of the first
/// element of each batch, not the array base address), but even though this makes it
/// less likely for the overflow to occur, it still can happen. In this case the code
/// falls back to using a vigra::TinyVector < ptrdiff_t > as gather/scatter index type,
/// which may cause Vc to use less performant code for the gather/scatter operations
/// but is safe.
///
/// source, target  views to read from and write to. The code relies on both having
///                 the same shape: only the source iterator is tested against its end.
/// axis            processing axis of the views passed in
/// gain, bc, nbpoles, pole, tolerance  handed on unchanged to the filter's constructor
// TODO: using different vsize for different axes might be faster.

template < typename source_view_type ,
           typename target_view_type ,
           typename math_type >
void ele_aggregating_filter ( source_view_type &source ,
                              target_view_type &target ,
                              int axis ,
                              double gain ,
                              bc_code bc ,
                              int nbpoles ,
                              const long double * pole ,
                              double tolerance
                            )
{
  // for prefiltering, using Vc::Vectors seems faster than using SimdArrays of twice the size,
  // which are used as simdized type in evaluation

  const int vsize = vspline::vector_traits < math_type > :: rsize ;
  typedef typename vspline::vector_traits < math_type , vsize > :: type simdized_math_type ;

  typedef typename source_view_type::value_type source_type ;

  // NOTE(review): simdized_source_type is not referenced anywhere in this function

  typedef typename vspline::vector_traits < source_type > :: type simdized_source_type ;

  typedef typename target_view_type::value_type target_type ;

  // NOTE(review): simdized_target_type is not referenced anywhere in this function

  typedef typename vspline::vector_traits < target_type > :: type simdized_target_type ;

  // indexes for gather/scatter. first the 'optimal' type, which Vc produces as
  // the IndexType for simdized_math_type. Next a wider type composed of std::ptrdiff_t,
  // to be used initially when calculating the indices, and optionally later for the
  // actual gather/scatter operations if gs_indexes_type isn't wide enough.

  typedef typename simdized_math_type::IndexType gs_indexes_type ;
  typedef vigra::TinyVector < std::ptrdiff_t , vsize > comb_type ;

  // mask type for masked operation
  typedef typename simdized_math_type::MaskType mask_type ;

  auto count = source.shape ( axis ) ; // number of vectors we'll process

  // I initially tried to use Vc::Memory, but the semantics of the iterator obtained
  // by buffer.begin() didn't work for me.
  // anyway, a MultiArray with the proper allocator works just fine, and the dereferencing
  // of the iterator needed in the solver works without further ado.

  vigra::MultiArray < 1 , simdized_math_type , Vc::Allocator<simdized_math_type> >
    buffer ( count ) ;

  // avoiding being specific about the iterator's type allows us to slot in
  // any old iterator we can get by calling begin() on buffer

  typedef decltype ( buffer.begin() ) viter_type ;

  // set of offsets into the source slice which will be used for gather/scatter

  comb_type source_indexes ;

  // while we don't hit the last odd few 1D subarrays the mask is all-true.
  // Once it has been narrowed (see below) it is never reset; a narrowed mask
  // only occurs when the slice iterator has reached its end, which also ends
  // the enclosing loop.

  mask_type mask ( true ) ;

  // next slice is this far away:

  auto source_stride = source.stride ( axis ) ;

  // we want to use the extended gather/scatter (with 'extrusion'), so we need the
  // source and target pointers. Note that no cast is applied to buffer's data
  // pointer: buffer_base_adress remains a pointer to simdized_math_type, which is
  // what the extended gather/scatter routines use as their simdized type.

  auto source_base_adress = source.data() ;
  auto buffer_base_adress = buffer.data() ;
  auto target_base_adress = target.data() ;

  gs_indexes_type source_gs_indexes ;
  gs_indexes_type target_gs_indexes ;

  // we create a solver object capable of handling the iterator producing the successive
  // simdized_types from the buffer. While the unvectorized code can omit passing the third
  // template argument (the elementary type used inside the solver) we pass it here, as we
  // don't define an element-expansion via vigra::ExpandElementResult for simdized_type.

  typedef filter < viter_type , viter_type , math_type > filter_type ;
  filter_type solver ( count , gain , bc , nbpoles , pole , tolerance ) ;

  if ( source.stride() == target.stride() )
  {
    // we already know that both arrays have the same shape. If the strides are also the same,
    // both arrays have the same structure in memory.
    // If both arrays have the same structure, we can save ourselves the index calculations
    // for the second array, since the indexes would come out the same. target_base_adress
    // may be the same as source_base_adress, in which case the operation is in-place, but
    // we can't derive any performance benefit from the fact.

    // pick the first slice of source along the processing axis

    auto source_slice = source.bindAt ( axis , 0 ) ;

    // we permute the slice's strides to ascending order to make the memory access
    // as efficient as possible.

    auto permuted_slice = source_slice.permuteStridesAscending() ;

    // we iterate over the elements in this slice - not to access them, but to
    // calculate their offset from the first one. This may not be the most efficient
    // way but it's simple and foolproof and will only be needed once per count values.

    auto source_sliter = permuted_slice.begin() ;
    auto source_sliter_end = permuted_slice.end() ;

    // each pass through this loop consumes up to vsize slice elements - the
    // iterator is advanced by the inner for loop, not at the end of the loop body

    while ( source_sliter < source_sliter_end )
    {
      // try loading vsize successive offsets into a comb_type

      int e ;

      // we base the operation so that the first entry in source_indexes
      // will come out 0. Since the strides are equal, the same offset from the
      // base address locates the batch's first element in target.

      auto first_source_adress = &(*source_sliter) ;
      auto offset = first_source_adress - source_base_adress ;
      auto first_target_adress = target_base_adress + offset ;

      // the loop body is the single assignment below; after the loop, e holds the
      // number of offsets actually obtained

      for ( e = 0 ; e < vsize && source_sliter < source_sliter_end ; ++e , ++source_sliter )

        source_indexes[e] = &(*source_sliter) - first_source_adress ;

      if ( e < vsize )

        // have got less than vsize? must be the last few items.
        // mask was all-true before, so now we limit it to the first e fields:

        mask = ( simdized_math_type::IndexesFromZero() < e ) ;

      // next we assign the indices (which are ptrdiff_t) to the intended type
      // for gather/scatter indices - which is what Vc deems appropriate. This should
      // be the optimal choice in terms of performance. Yet we can't be certain that
      // the ptrdiff_t values actually fit into this type, which is usually composed of
      // int only. So we test if the assigned value compares equal to the assignee.
      // If the test fails for any of the indices, we switch to code using a
      // vigra::TinyVector < ptrdiff_t > for the indices, which is permissible, since
      // TinyVector offers operator[], but may be less efficient.
      // Note: Vc hasn't implemented the gather with intrinsics for AVX2, that's why
      // using gs_indexes_type can't yet have a speedup effect.
      // Note: since the gathers are often from widely spaced locations, there is
      // not too much benefit to be expected.
      // e is reused as plain loop index here; the test covers all vsize entries,
      // including those beyond a partial batch, which still hold whatever
      // source_indexes contained before (presumably the previous batch's offsets,
      // or comb_type's initial content on the first pass - TODO confirm).

      bool fits = true ;
      for ( e = 0 ; fits && ( e < vsize ) ; e++ )
      {
        source_gs_indexes[e] = source_indexes[e] ;
        if ( source_gs_indexes[e] != source_indexes[e] )
          fits = false ;
      }

      if ( fits )
      {
        // perform extended gather with extrusion parameters to transport the unfiltered data
        // to the buffer, passing in source_gs_indexes for best performance.

        gather
          ( first_source_adress ,
            buffer_base_adress ,
            source_gs_indexes ,
            mask ,
            source_stride ,
            count ) ;

        // finally (puh): apply the prefilter, using the solver in-place, iterating over
        // the vectors in buffer with maximum efficiency.

        solver.solve ( buffer.begin() ) ;

        // and perform extended scatter with extrusion parameters to write the filtered data
        // to the destination. With equal strides, the source's indexes and stride
        // serve for the target as well.

        scatter
          ( buffer_base_adress ,
            first_target_adress ,
            source_gs_indexes ,
            mask ,
            source_stride ,
            count ) ;
      }
      else
      {
        // Since the indices did not fit into the optimal type for gather/scatter
        // indices, we pass in a wider type, which may reduce performance, but is
        // necessary under the circumstances. But this should rarely happen:
        // it would mean a gather/scatter spanning several GB.

        gather
          ( first_source_adress ,
            buffer_base_adress ,
            source_indexes ,
            mask ,
            source_stride ,
            count ) ;

        solver.solve ( buffer.begin() ) ;

        scatter
          ( buffer_base_adress ,
            first_target_adress ,
            source_indexes ,
            mask ,
            source_stride ,
            count ) ;
      }
    }
  }
  else
  {
    // pretty much the same as the if(...) case, with the distinction that copying
    // the filtered data from the buffer to the target now needs its own set of
    // indexes etc., since all these may be different.

    // TODO we might permute source_slice's strides to ascending and apply the same
    // permutation to target_slice.

    auto source_slice = source.bindAt ( axis , 0 ) ;
    auto source_sliter = source_slice.begin() ;
    auto source_sliter_end = source_slice.end() ;

    auto target_slice = target.bindAt ( axis , 0 ) ;
    auto target_stride = target.stride ( axis ) ;
    auto target_sliter = target_slice.begin() ;
    comb_type target_indexes ;

    // as above, the iterators are advanced by the inner for loop; the target
    // iterator moves in step with the source iterator and is not tested against
    // an end of its own

    while ( source_sliter < source_sliter_end )
    {
      int e ;
      auto first_source_adress = &(*source_sliter) ;
      auto first_target_adress = &(*target_sliter) ;

      // collect up to vsize offsets, relative to the batch's first element, for
      // source and target; after the loop, e holds the number of offsets obtained

      for ( e = 0 ;
           e < vsize && source_sliter < source_sliter_end ;
           ++e , ++source_sliter , ++target_sliter )
      {
        source_indexes[e] = &(*source_sliter) - first_source_adress ;
        target_indexes[e] = &(*target_sliter) - first_target_adress ;
      }

      // fewer than vsize offsets: limit the mask to the first e fields

      if ( e < vsize )
        mask = ( simdized_math_type::IndexesFromZero() < e ) ;

      // similar code here for the indexes, see notes above. Both sets of indexes
      // have to fit into gs_indexes_type for the 'optimal' variant to be used.

      bool fits = true ;
      for ( e = 0 ; fits && ( e < vsize ) ; e++ )
      {
        source_gs_indexes[e] = source_indexes[e] ;
        target_gs_indexes[e] = target_indexes[e] ;
        if (    source_gs_indexes[e] != source_indexes[e]
             || target_gs_indexes[e] != target_indexes[e] )
          fits = false ;
      }

      if ( fits )
      {
        // gather, filter and scatter using the 'optimal' index type

        gather
          ( first_source_adress ,
            buffer_base_adress ,
            source_gs_indexes ,
            mask ,
            source_stride ,
            count ) ;
        solver.solve ( buffer.begin() ) ;
        scatter
          ( buffer_base_adress ,
            first_target_adress ,
            target_gs_indexes ,
            mask ,
            target_stride ,
            count ) ;
      }
      else
      {
        // gather, filter and scatter using the wider ptrdiff_t indexes

        gather
          ( first_source_adress ,
            buffer_base_adress ,
            source_indexes ,
            mask ,
            source_stride ,
            count ) ;
        solver.solve ( buffer.begin() ) ;
        scatter
          ( buffer_base_adress ,
            first_target_adress ,
            target_indexes ,
            mask ,
            target_stride ,
            count ) ;
      }
    }
  }
}

/// here we provide a common routine 'aggregating_filter', which works for elementary
/// value_types and also for aggregate value_types. Processing is different for these
/// two cases, because the vector code can only process elementary types, and if
/// value_type isn't elementary, we need to element-expand the source and target
/// arrays. Since this routine is the functor passed to multithread() and therefore
/// receives a range parameter to pick out a subset of the data to process in the
/// single thread, we also take the opportunity here to pick out the subarrays
/// for further processing.
///
/// range              delimits the part of the original arrays to process
/// p_original_source  pointer to the complete source view
/// p_original_target  pointer to the complete target view; its value_type has to
///                    be the same as the source's, which is checked at compile time
/// axis               processing axis, referring to the unexpanded arrays
/// gain, bc, nbpoles, pole, tolerance  handed on unchanged to ele_aggregating_filter

template < class source_type ,
           class target_type ,
           typename math_type >
void aggregating_filter ( range_type < typename source_type::difference_type > range ,
                          source_type * p_original_source ,
                          target_type * p_original_target ,
                          int axis ,
                          double gain ,
                          bc_code bc ,
                          int nbpoles ,
                          const long double * pole ,
                          double tolerance
                        )
{
  typedef typename source_type::value_type value_type ;
  static_assert ( std::is_same < value_type , typename target_type::value_type > :: value ,
    "aggregating_filter: both arrays must have the same value_type" ) ;

  // continue processing on the subarrays of source and target specified by 'range':

  auto source = p_original_source->subarray ( range[0] , range[1] ) ;
  auto target = p_original_target->subarray ( range[0] , range[1] ) ;

  // value_type may be an aggregate type, but we want to operate on elementary types
  // so we element-expand the array and call ele_aggregating_filter, which works on
  // arrays with elementary types. If value_type is elementary already, the call to
  // expandElements inserts a singleton dimension, but this has next to no performance
  // impact, so contrary to my initial implementation I don't handle the 1-channel
  // case separately any more.

  auto expanded_source = source.expandElements ( 0 ) ;
  auto expanded_target = target.expandElements ( 0 ) ;

  // with the element-expanded arrays at hand, we can now delegate to
  // ele_aggregating_filter. The expansion was requested at position 0, so the
  // processing axis is passed on as axis + 1.

  ele_aggregating_filter < decltype ( expanded_source ) ,
                           decltype ( expanded_target ) ,
                           math_type >
              ( expanded_source ,
                expanded_target ,
                axis + 1 ,
                gain ,
                bc ,
                nbpoles ,
                pole ,
                tolerance ) ;
}

#else

// without Vc there is no vectorized filtering routine. The compiler still needs
// something by the name of aggregating_filter to refer to, so we provide this
// 'hull'. It is never meant to be called - if it is, the assertion fires.

template < class source_type , class target_type , typename math_type >
void aggregating_filter()
{
  assert ( false ) ;
}

#endif

/// With the routines performing the buffering and filtering for a chunk of data in
/// place, we add the code for multithreading, using utility code from multithread.h.
///
/// Note the template parameter 'is_1d': we use specialized code for 1D arrays, see below.
/// Since there *is* a specialization for is_1d == true_type, this variant will only be
/// called if is_1d == false_type
///
/// This general variant hands the work to aggregating_filter. Note that math_type
/// only serves to provide the default for rsize here: aggregating_filter is
/// instantiated with the elementary type of the input array's value_type.

template < int dim ,
           typename is_1d ,
           typename input_array_type ,  ///< type of array with knot point data
           typename output_array_type , ///< type of array for coefficients (may be the same)
           typename math_type ,         ///< real data type used for calculations inside the filter
           int rsize = vspline::vector_traits < math_type > :: rsize >
class filter_1d
{
public:
  void operator() ( input_array_type &input ,    ///< source data. can also operate in-place,
                    output_array_type &output ,  ///< where input == output.
                    int axis ,
                    double gain ,
                    bc_code bc ,                 ///< boundary treatment for this solver
                    int nbpoles ,
                    const long double * pole ,
                    double tolerance ,
                    int njobs = default_njobs )  ///< number of jobs to use when multithreading
  {
    using value_type = typename input_array_type::value_type ;
    using ele_type = typename vigra::ExpandElementResult < value_type > :: type ;

    // the (single-threaded) filtering routine which multithread() will invoke
    // for every chunk of data

    auto single_thread_filter = & aggregating_filter < input_array_type ,
                                                       output_array_type ,
                                                       ele_type > ;

    // The data array is partitioned into subranges 'manually' here: shape_splitter
    // must be told not to chop up the current processing axis, which is done by
    // passing axis as the 'forbid' parameter.

    auto ranges = shape_splitter<dim>::part ( input.shape() , njobs , axis ) ;

    // multithread() distributes the ranges to individual jobs, which are executed
    // by its thread pool.

    multithread ( single_thread_filter ,
                  ranges ,
                  &input ,
                  &output ,
                  axis ,
                  gain ,
                  bc ,
                  nbpoles ,
                  pole ,
                  tolerance ) ;
  }
} ;

/// specialization for rsize == 1: no vectorization is used, even if Vc is available.
/// is_1d has to be given as false_type here - the specialization below for
/// is_1d == true_type is meant to catch all 1D cases.
///
/// This variant hands the work to nonaggregating_filter, which is instantiated with
/// the input array's value_type as the type to do the maths in.

template < int dim ,
           typename input_array_type ,  ///< type of array with knot point data
           typename output_array_type , ///< type of array for coefficients (may be the same)
           typename math_type           ///< real data type used for calculations inside the filter
           >
class filter_1d < dim , std::false_type , input_array_type , output_array_type , math_type , 1 >
{
public:
  void operator() ( input_array_type &input ,    ///< source data. can also operate in-place,
                    output_array_type &output ,  ///< where input == output.
                    int axis ,
                    double gain ,
                    bc_code bc ,                 ///< boundary treatment for this solver
                    int nbpoles ,
                    const long double * pole ,
                    double tolerance ,
                    int njobs = default_njobs )  ///< number of jobs to use when multithreading
  {
    using value_type = typename input_array_type::value_type ;

    // the (single-threaded) filtering routine which multithread() will invoke
    // for every chunk of data

    auto single_thread_filter = & nonaggregating_filter < input_array_type ,
                                                          output_array_type ,
                                                          value_type > ;

    // The data array is partitioned into subranges 'manually' here: shape_splitter
    // must be told not to chop up the current processing axis, which is done by
    // passing axis as the 'forbid' parameter.

    auto ranges = shape_splitter<dim>::part ( input.shape() , njobs , axis ) ;

    // multithread() distributes the ranges to individual jobs, which are executed
    // by the thread pool.

    multithread ( single_thread_filter ,
                  ranges ,
                  &input ,
                  &output ,
                  axis ,
                  gain ,
                  bc ,
                  nbpoles ,
                  pole ,
                  tolerance ) ;
  }
} ;

/// now here's the specialization for *1D arrays*. It may come as a surprise that it looks
/// nothing like the nD routine. This is due to the fact that we follow a specific strategy:
/// We 'fold up' the 1D array into a 'fake 2D' array, process this 2D array with the nD code
/// which is very efficient, and 'mend' the stripes along the margins of the fake 2D array
/// which contain wrong results due to the fact that some boundary condition appropriate
/// for the 2D case was applied.
/// With this 'cheat' we can handle 1D arrays with full multithreading and vectorization,
/// while the 'orthodox' approach would have to process the data in linear order with
/// a single thread. Cleaning up the 'dirty' margins is cheap for large arrays.
/// The code is making guesses as to whether it's worth while to follow this strategy;
/// the array has to be 'quite large' before 'fake 2D processing' is actually applied.

template < typename input_array_type ,  ///< type of array with knot point data
           typename output_array_type , ///< type of array for coefficients (may be the same)
           typename math_type ,         ///< type for calculations inside filter
           int rsize >
class filter_1d < 1 ,                   // specialize for 1D
                  std::true_type ,      // specialize for is_1d == true_type
                  input_array_type ,
                  output_array_type ,
                  math_type ,
                  rsize >
{
public:
  void operator() ( input_array_type &input ,    ///< source data. can operate in-place
                    output_array_type &output ,  ///< where input == output.
                    int axis ,
                    double gain ,
                    bc_code bc ,                 ///< boundary treatment for this solver
                    int nbpoles ,
                    const long double * pole ,
                    double tolerance ,
                    int njobs = default_njobs )  ///< number of jobs to use
{
  typedef typename input_array_type::value_type value_type ;
  typedef decltype ( input.begin() ) input_iter_type ;
  typedef decltype ( output.begin() ) output_iter_type ;
  typedef vspline::filter < input_iter_type , output_iter_type , double > filter_type ;
  typedef typename vigra::ExpandElementResult < value_type > :: type ele_type ;

  if ( nbpoles <= 0 )
  {
    // nbpoles == 0 means we're prefiltering for a degree 0 or 1 spline.
    // so we don't need to filter anything, but if we're not operating
    // in-place, we want the input copied to the output.
    // we use the simple single-threaded implementation here for now.
    // this should be memory-bound, so multithreading it might be futile
    // TODO: might use multithreaded code to copy input to otput, test
    
    auto it1 = input.begin() ;
    auto it2 = output.begin() ;
    
    void * pi = &(*(it1)) ;
    void * po = &(*(it2)) ;
    
    // if operation isn't in-place
    if ( pi != po )
    {    
      // copy input to output
      auto ie = input.end() ;
      while ( it1 != ie )
      {
        *it2 = *it1 ;
        ++it1 ;
        ++it2 ;
      }
    }
    return ; // return prematurely, saving us an else clause
  }

  const int bands = vigra::ExpandElementResult < value_type > :: size ;
  int runup ;

  // if we can multithread, start out with as many lanes as the desired number of threads

  int lanes = njobs ;
  
#ifdef USE_VC
 
  const int vsize = vector_traits < ele_type > :: size ;
//   const int vsize = Vc::Vector < ele_type > :: Size ;
  
  // if we can use vector code, the number of lanes is multiplied by the
  // number of elements a simdized type inside the vector code can handle

  lanes *= vsize ;

#endif

  // we give the filter some space to run up to precision
  
  if ( tolerance <= 0.0 )
  {
    // we can't use the fake_2d method if the tolerance is 0.0
    lanes = 1 ;
  }
  else
  {
    // there are minimum requirements for using the fake 2D filter. First find
    // the horizon at the given tolerance
    
    int horizon = ceil ( log ( tolerance ) / log ( fabs ( pole[0] ) ) ) ;
    
    // this is just as much as we want for the filter to run up to precision
    // starting with BC code 'ZEROPAD' at the margins
    
    runup = horizon ;
    
    // the absolute minimum to successfully run the fake 2D filter is this:
    // TODO we might rise the threshold, min_length, here
    
    int min_length = 4 * runup * lanes + 2 * runup ;
    
    // input is too short to bother with fake 2D, just single-lane it
    
    if ( input.shape(0) < min_length )
    {
      lanes = 1 ;
    }
    else
    {
      // input is larger than the absolute minimum, maybe we can even increase
      // the number of lanes some more? we'd like to do this if the input is
      // very large, since we use buffering and don't want the buffers to become
      // overly large. But the smaller the run along the split x axis, the more
      // incorrect margin values we have to mend, so we need a compromise.
      // assume a 'good' length for input: some length where further splitting
      // would not be wanted anymore. TODO: do some testing, find a good value
      
      int good_length = 64 * runup * lanes + 2 * runup ;
      
      int split = 1 ;
      
      // suppose we split input.shape(0) in ( 2 * split ) parts, is it still larger
      // than this 'good' length? If not, leave split factor as it is.
      
      while ( input.shape(0) / ( 2 * split ) >= good_length )
      {  
        // if yes, double split factor, try again
        split *= 2 ;
      }
      
      lanes *= split ; // increase number of lanes by additional split
    }
    
  }
  
  // if there's only one lane we just use this simple code:

  if ( lanes == 1 )
  {
    // this is a simple single-threaded implementation
    filter_type solver ( input.shape(0) ,
                         gain ,
                         bc ,
                         nbpoles ,
                         pole ,
                         0.0 ) ;
    solver.solve ( input.begin() , output.begin() ) ;
    return ; // return prematurely, saving us an else clause
  }
  
  // the input qualifies for fake 2D processing.

//   std::cout << "fake 2D processing with " << lanes << " lanes" << std::endl ;
  
  // we want as many chunks as we have lanes. There may be some data left
  // beyond the chunks (tail_size of value_type)
  
  int core_size = input.shape(0) ;
  int chunk_size = core_size / lanes ;
  core_size = lanes * chunk_size ;
  int tail_size = input.shape(0) - core_size ;
  
  // just doublecheck

  assert ( core_size + tail_size == input.shape(0) ) ;
  
  // now here's the strategy: we treat the data as if they were 2D. This will
  // introduce errors along the 'vertical' margins, since there the 2D treatment
  // will start with some boundary condition along the x axis instead of looking
  // at the neighbouring line where the actual continuation is.
  
  // first we deal with the very beginning and end of the signal. This requires
  // special treatment, because here we want the boundary conditions to take
  // effect. So we copy the beginning and end of the signal to a buffer, being
  // generous with how many data we pick. The resulting buffer will have an
  // unusable part in the middle, where tail follows head, but since we've made
  // sure that this location is surrounded by enough 'runup' data, the effect
  // will only be detectable at +/- runup from the point where tail follows head.
  // The beginning of head and the end of tail are at the beginning and end
  // of the buffer, though, so that applying the boundary condition will
  // have the desired effect. What we'll actually use of the buffer is not
  // the central bit with the effects of the clash of head and tail, but
  // only the bits at the ends which aren't affected because they are far enough
  // away.
  
  // note how this code fixes a bug in my initial implementation, which produced
  // erroneous results with periodic splines, because the boundary condition
  // was not properly honoured.
  
  // calculate the sizes of the parts of the signal we'll put into the buffer
  int front = 2 * runup ;
  int back = tail_size + 2 * runup ;
  int total = front + back ;
  
  // create the buffer and copy the beginning and end of the signal into it
  vigra::MultiArray < 1 , value_type > head_and_tail ( total ) ;
  auto target_it = head_and_tail.begin() ;
  auto source_it = input.begin() ;
  for ( int i = 0 ; i < front ; i++ )
  {
    *target_it = *source_it ;
    ++target_it ;
    ++source_it ;
  }
  source_it = input.end() - back ;
  for ( int i = 0 ; i < back ; i++ )
  {
    *target_it = *source_it ;
    ++target_it ;
    ++source_it ;
  }

  // set up the filter for this buffer and apply it
  filter_type head_and_tail_solver ( head_and_tail.size() ,
                                     gain ,
                                     bc ,
                                     nbpoles ,
                                     pole ,
                                     0.0 ) ;
  head_and_tail_solver.solve ( head_and_tail.begin() ) ;

  // set up two MultiArrayViews corresponding to the portions of the data
  // we copied into the buffer. The first bit of 'head' and the last bit
  // of 'tail' hold valid data and will be used further down.

  vigra::MultiArrayView < 1 , value_type > head
    ( vigra::Shape1 ( front ) , head_and_tail.data() ) ;

  vigra::MultiArrayView < 1 , value_type > tail
    ( vigra::Shape1 ( back ) , head_and_tail.data() + front ) ;
  
  // end of bug fix for periodic splines

  // head now has runup correct values at the beginning, succeeded by runup invalid
  // values, and tail has tail_size + runup correct values at the end, preceded by
  // runup values which aren't usable.

  // now we create a fake 2D view to the margin of the data. Note how we let the
  // view begin 2 * runup before the end of the first line, capturing the 'wraparound'
  // right in the middle of the view
  
  typedef vigra::MultiArrayView < 2 , value_type > fake_2d_type ;
  
  fake_2d_type
    fake_2d_margin ( vigra::Shape2 ( 4 * runup , lanes - 1 ) ,
                     vigra::Shape2 ( input.stride(0) , input.stride(0) * chunk_size ) ,
                     input.data() + chunk_size - 2 * runup ) ;
 
  // again we create a buffer and filter into the buffer

  vigra::MultiArray < 2 , value_type > margin_buffer ( fake_2d_margin.shape() ) ;
  
  filter_1d < 2 , std::false_type , fake_2d_type , fake_2d_type , math_type > ()
    ( fake_2d_margin ,
      margin_buffer ,
      0 ,
      gain ,
      GUESS ,
      nbpoles ,
      pole ,
      tolerance ,
      1 ) ;
 
  // now we have filtered data for the margins in margin_buffer, of which the central half
  // is usable, the remainder being runup data which we'll ignore. Here's a view to the
  // central half:
  
  vigra::MultiArrayView < 2 , value_type > margin
  = margin_buffer.subarray ( vigra::Shape2 ( runup , 0 ) ,
                             vigra::Shape2 ( 3 * runup , lanes - 1 ) ) ;
  
  // we already create a view to the target array's margin which we intend to overwrite,
  // but the data will only be copied in from margin after the treatment of the core.

  vigra::MultiArrayView < 2 , value_type >
    margin_target ( vigra::Shape2 ( 2 * runup , lanes - 1 ) ,
                    vigra::Shape2 ( output.stride(0) , output.stride(0) * chunk_size ) ,
                    output.data() + chunk_size - runup ) ;
                    
  // next we fake a 2D array from input and filter it to output, this may be an
  // in-place operation, since we've extracted all margin information earlier and
  // deposited what we need in buffers
  
  fake_2d_type
    fake_2d_source ( vigra::Shape2 ( chunk_size , lanes ) ,
                     vigra::Shape2 ( input.stride(0) , input.stride(0) * chunk_size ) ,
                     input.data() ) ;

  fake_2d_type
    fake_2d_target ( vigra::Shape2 ( chunk_size , lanes ) ,
                     vigra::Shape2 ( output.stride(0) , output.stride(0) * chunk_size ) ,
                     output.data() ) ;
  
  // now we filter the fake 2D source to the fake 2D target

  filter_1d < 2 , std::false_type , fake_2d_type , fake_2d_type , math_type > ()
    ( fake_2d_source ,
      fake_2d_target ,
      0 ,
      gain ,
      GUESS ,
      nbpoles ,
      pole ,
      tolerance ,
      njobs ) ;

  // we now have filtered data in target, but the stripes along the magin
  // in x-direction (1 runup wide) are wrong, because we applied GUESS BC.
  // this is why we have the data in 'margin', and we now copy them to the
  // relevant section of 'target'
               
  margin_target = margin ;
  
  // finally we have to fix the first and last few values, which weren't touched
  // by the margin operation (due to margin's offset and length)
  
  typedef vigra::Shape1 dt ;
  
  output.subarray ( dt(0) , dt(runup) )
    = head.subarray ( dt(0) , dt(runup) ) ;

  output.subarray ( dt(output.size() - tail_size - runup ) , dt(output.size()) )
    = tail.subarray ( dt(tail.size() - tail_size - runup ) , dt(tail.size()) ) ;
}
} ;

/// This routine calls the 1D filtering routine for all axes in turn. This is the
/// highest-level routine in filter.h, and the only routine used by other code in
/// vspline. It has no code specific to b-splines, any set of poles will be processed.
/// To use this routine for b-splines, the correct poles have to be passed in, which
/// is done in prefilter.h, where the code for prefiltering the knot point data
/// calls filter_nd with the poles needed for a b-spline.
///
/// This routine takes the following parameters:
///
/// - input, output: MultiArrayViews of the source and target array
/// - bc: TinyVector of boundary condition codes, allowing separate values for each axis
/// - nbpoles: number of filter poles
/// - pole: pointer to nbpoles long doubles containing the filter poles
/// - tolerance: acceptable error
/// - njobs: number of jobs to use when multithreading

// TODO look into treatment of singleton dimensions

template < typename input_array_type ,  // type of array with knot point data
           typename output_array_type , // type of array for coefficients (may be the same)
           typename math_type >         // type used for arithmetic operations in filter
void filter_nd ( input_array_type & input ,
                 output_array_type & output ,
                 vigra::TinyVector<bc_code,input_array_type::actual_dimension> bc ,
                 int nbpoles ,
                 const long double * pole ,
                 double tolerance ,
                 int njobs = default_njobs )
{
  // check if operation is in-place. I assume that the test performed here
  // is sufficient to determine if the operation is in-place.
  
  bool in_place = false ;
  
  if ( (void*)(input.data()) == (void*)(output.data()) )
    in_place = true ;

  // if input == output, with degree <= 1 we needn't do anything at all.
  
  if ( in_place && nbpoles < 1 )
    return ;

  // do a bit of compatibility checking
  
  const int dim = input_array_type::actual_dimension ;
  
  if ( output_array_type::actual_dimension != dim )
  {
    throw dimension_mismatch ( "input and output array must have the same dimension" ) ;
  }
  
  typedef typename input_array_type::difference_type diff_t ;
  diff_t shape = input.shape() ;
  if ( output.shape() != shape )
  {
    throw shape_mismatch ( "input and output array must have the same shape" ) ;
  }

  // normally the gain is the same for all dimensions.

  double gain_d0 = overall_gain ( nbpoles , pole ) ;
  double gain_dn = gain_d0 ;

  // deactivating the code below may produce slightly more precise results
  // This bit of code results in applictation of the cumulated gain for all dimensions
  // while processing axis 0, and no gain application for subsequent axes.
  // heuristic. for high degrees, below optimization reduces precision too much
  // TODO: the effect of this optimization seems negligible.
  
  if ( dim > 1 && pow ( nbpoles , dim ) < 32 )
  {
    gain_d0 = pow ( gain_d0 , dim ) ;
    gain_dn = 1.0 ;
  }

  // even if degree <= 1, we'll only arrive here if input != output.
  // So we still have to copy the input data to the output (solve_identity)

  typedef std::integral_constant < bool , dim == 1 > is_1d ;
  
  filter_1d < dim , is_1d , input_array_type , output_array_type , math_type > ()
    ( input ,
      output ,
      0 ,
      gain_d0 ,
      bc[0] ,
      nbpoles ,
      pole ,
      tolerance ,
      njobs ) ;

  // but if degree <= 1 we're done already, since copying the data again
  // in dimensions 1... is futile

  if ( nbpoles > 0 )
  {
    // so for the remaining dimensions we also call the filter.
    for ( int d = 1 ; d < dim ; d++ )
      filter_1d < dim , is_1d , output_array_type , output_array_type , math_type ,
                  vspline::vector_traits < math_type > :: rsize > ()
        ( output ,
          output ,
          d ,
          gain_dn ,
          bc[d] ,
          nbpoles ,
          pole ,
          tolerance ,
          njobs ) ;
  }
}

} ; // namespace vspline

#endif // VSPLINE_FILTER_H
