/************************************************************************/
/*                                                                      */
/*    vspline - a set of generic tools for creation and evaluation      */
/*              of uniform b-splines                                    */
/*                                                                      */
/*            Copyright 2015 - 2020 by Kay F. Jahnke                    */
/*                                                                      */
/*    Permission is hereby granted, free of charge, to any person       */
/*    obtaining a copy of this software and associated documentation    */
/*    files (the "Software"), to deal in the Software without           */
/*    restriction, including without limitation the rights to use,      */
/*    copy, modify, merge, publish, distribute, sublicense, and/or      */
/*    sell copies of the Software, and to permit persons to whom the    */
/*    Software is furnished to do so, subject to the following          */
/*    conditions:                                                       */
/*                                                                      */
/*    The above copyright notice and this permission notice shall be    */
/*    included in all copies or substantial portions of the             */
/*    Software.                                                         */
/*                                                                      */
/*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND    */
/*    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES   */
/*    OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND          */
/*    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT       */
/*    HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,      */
/*    WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING      */
/*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR     */
/*    OTHER DEALINGS IN THE SOFTWARE.                                   */
/*                                                                      */
/************************************************************************/

/// \file anytype.cc
///
/// \brief demonstrates the three modes of vectorization:
///        scalar, emulated and with Vc
///
/// compile: clang++ -std=c++11 -o anytype anytype.cc
/// or: clang++ -std=c++11 -DUSE_VC -o anytype anytype.cc -lVc
///
/// Do try out using g++ as well, and use various levels of optimization.
/// When I did these tests, I found no conclusive results with the speed
/// tests which would warrant a clear recommendation. But then, the math
/// used is so trivial that optimization may come up with really clever
/// ways of speeding them up, and surely a lot of time is spent on the
/// memory access. The main point of this program isn't the speed tests,
/// though, but the demonstration of the vectorization modes.

#include <iostream>
#include <ctime>
#include <chrono>

// for this demonstration, multithreading is not really wanted; uncomment
// the following #define to make vspline run single-threaded:

// #define VSPLINE_SINGLETHREAD

#include <vspline/vspline.h>

// we use a type 'test_t' which is 'monolithic', meaning it can't be
// processed by vigra's ExpandElementResult mechanism:

// Opaque demo payload: a plain struct whose only member is a 12-byte char
// array. It offers no element access or arithmetic, so the functors in
// this file can only copy it around as a whole.
struct test_t
{
  char padding[12] ; // filler bytes giving the type some size
} ;

// just so we can << test_t to cout

// stream insertion for test_t: writes the fixed tag "test_t" (the
// payload 'x' itself is not inspected) and hands the stream back so
// that further << operations can be chained

std::ostream & operator<< ( std::ostream & osr , const test_t & x )
{
  return osr << "test_t" ;
}

// We define two functors for this type: one scalar functor, which
// is realized by setting the template argument 'vsize' to 1:

// scalar-only functor: the third template argument ('vsize') is 1 and
// the only eval provided takes single test_t values. The value is
// passed through unchanged and reported on cout.

struct scalar_f
: public vspline::unary_functor < test_t , test_t , 1 >
{
  // in:  value to process
  // out: receives a copy of 'in'
  void eval ( const test_t & in , test_t & out ) const
  {
    out = in ;
    std::cout << "scalar_f: " << in << std::endl ;
  }
} ;

// another functor with no specified vsize, resulting in the
// use of the default in simdize_traits. Here we provide an eval
// overload matching vectorized arguments

// functor with default vsize and two eval overloads. Both pass the
// input through unchanged; the message written to cout tells which of
// the two overloads was invoked.

struct poly_f
: public vspline::unary_functor < test_t , test_t >
{
  // overload for exactly one test_t. Being a non-template, it is
  // preferred over the template below whenever the arguments are
  // plain test_t.
  void eval ( const test_t & in , test_t & out ) const
  {
    out = in ;
    std::cout << "single-value poly_f: " << in << std::endl ;
  }

  // overload for all other argument types, here: the vectorized ones
  template < typename source_type , typename target_type >
  void eval ( const source_type & in , target_type & out ) const
  {
    out = in ;
    std::cout << "vectorized poly_f: " << in << std::endl ;
  }
} ;

// a functor processing 'double', which can be
// processed by Vc if it is present. Note how vspline::unary_functor
// 'looks' at vspline::vector_traits to 'glean' the type of vectorized
// arguments it can process: if Vc is present, the code in vector.h
// finds specializations of 'simd_traits' which 'produce' Vc::SimdArray
// as 'type'. So this type appears in vector_traits and, from there,
// is taken to be the vectorized argument type, which is of the same
// type for input and output, in this case. Since the inheritance from
// vspline::unary_functor is public, we can readily use this type by
// the name 'in_v' and 'out_v'. If the unvectorized and the vectorized
// eval method can share code, the operation can be coded as a member
// function template, see the variant below.

// functor for 'double' with two explicitly typed eval overloads, one
// for single values and one for the vectorized types in_v and out_v
// which are inherited from vspline::unary_functor. Both overloads pass
// the input through unchanged and report it on cout.

struct double_f
: public vspline::unary_functor < double , double >
{
  // single-value overload
  void eval ( const double & in , double & out ) const
  {
    out = in ;
    std::cout << "unvectorized double_f: " << in << std::endl ;
  }

  // overload taking the inherited vectorized argument types
  void eval ( const in_v & in , out_v & out ) const
  {
    out = in ;
    std::cout << "vectorized double_f: " << in << std::endl ;
  }
} ;

// Like in the previous example, all the code actually does (apart from
// <<ing to cout) is an assignment of input to output. So we can share
// the code between the vectorized and unvectorized routine and use a
// member function template. Opportunities for writing vector-agnostic
// code appear reasonably frequently, since the vectorized data types
// are modeled to be usable just like scalars, and as long as the code
// just performs arithmetic or moves data around, we can use uniform
// syntax. If the code lends itself to be shared in this way, we have
// a beautifully simple way of getting (potentially explicitly) vectorized
// processing of the data at hand without even having to be aware of the
// fact: vspline's wielding code will parcel out the data in the array
// into whatever vector type it's told to use, the functor is called
// with vector data which are processed just like scalar data, and after
// the processing is done, vspline's wielding code scatters the result
// back to the target array. If Vc is present and the elementary data
// type can be used to form Vc::SimdArrays, the operation will be done
// with ISA-specific intrinsics, but all the user needs to provide is a
// simple apparently scalar piece of code which is magically 'rolled out'
// into vector code. And if the vector data need syntax which does not
// compute with scalar data, the vectorized routine can be written
// explicitly, like in the previous example.
// Note also that vspline's wielding code has specializations to parcel
// out data into Vc::SimdArrays, using Vc::InterleavedMemoryWrapper,
// making the process even faster. In fact, the resulting code is just the
// same as if the whole operation had been hand-coded, but it's all done
// automatically. And since the code is 'structurally vectorized', not
// using Vc does not even cost much performance: operating on vspline's
// simd_type data type will trigger the compiler's autovectorization, and
// the resulting binary often comes close to explicit vector code; see
// the next example.

// functor for 'double' with a single eval member function template
// which serves scalar and vectorized arguments alike: the body only
// prints and assigns, and that syntax is the same for both.

struct t_double_f
: public vspline::unary_functor < double , double >
{
  // source_type/target_type are deduced from the arguments passed in
  template < typename source_type , typename target_type >
  void eval ( const source_type & in , target_type & out ) const
  {
    out = in ;
    std::cout << "t_double_f using eval template: " << in << std::endl ;
  }
} ;

// Here we have a functor doing some arithmetic, which we'll 'apply'
// to a large array, measuring execution time.

// arithmetic test functor for 'float' data. With d = in + in and
// f = 3.5 + in / 5.5 it computes out = d * f - ( d * f ) / 2.2.
// The same template body is used for scalar and vectorized arguments.

struct sample_math
: public vspline::unary_functor < float >
{
  // Note: 'in' is only read before 'out' is written, so the call also
  // works if both refer to the same object.
  template < typename dtype >
  void eval ( const dtype & in , dtype & out ) const
  {
    const auto factor = 3.5f + in / 5.5f ;
    auto doubled = in + in ;
    doubled *= factor ;
    out = doubled - doubled / 2.2f ;
  }
} ;

// three-channel float value, used as data type for sample_math3
using f3_t = vigra::TinyVector < float , 3 > ;

// three-channel variant of the arithmetic test functor: the same
// formula as in sample_math is applied to each of the three channels
// of the argument separately.

struct sample_math3
: public vspline::unary_functor < f3_t >
{
  // For every channel, in[channel] is read before out[channel] is
  // written, so the call also works if 'in' and 'out' refer to the
  // same object.
  template < typename dtype >
  void eval ( const dtype & in , dtype & out ) const
  {
    for ( int channel = 0 ; channel < 3 ; ++channel )
    {
      auto doubled = in [ channel ] + in [ channel ] ;
      doubled *= ( 3.5f + in [ channel ] / 5.5f ) ;
      out [ channel ] = doubled - doubled / 2.2f ;
    }
  }
} ;

#ifdef USE_VC

// vector data like Vc::float_v are 'monolithic', since there is no
// element-expansion defined for them. So they can be used as arguments
// for a unary functor, and 'vectorizing' them will gather several of
// them in a simd_type. While this is not used inside vspline and such
// data aren't currently usable in b-splines, it's nice to have, since
// the processing of an array of vector data is automatically
// multithreaded and the vectors are processed in small batches without
// having to form SimdArrays from them.

// functor taking Vc::float_v as its 'elementary' data type. Both eval
// overloads add 3.0f to their input and write the result to cout.

struct float_v_f
: public vspline::unary_functor < Vc::float_v , Vc::float_v >
{
  // overload processing one Vc::float_v
  void eval ( const Vc::float_v & in , Vc::float_v & out ) const
  {
    out = in + 3.0f ;
    std::cout << "single-value float_v_f: " << out << std::endl ;
  }

  // overload for the vectorized argument types: the vsize members of
  // the argument are processed one after the other
  template < typename source_type , typename target_type >
  void eval ( const source_type & in , target_type & out ) const
  {
    for ( int member = 0 ; member < vsize ; ++member )
    {
      out [ member ] = in [ member ] + 3.0f ;
    }
    std::cout << "vectorized float_v_f: " << out << std::endl ;
  }
} ;

#endif

// using vspline's 'apply' function, we apply the functors to
// arrays holding appropriate data and observe the output. Use of
// scalar_f processes every element singly, the other calls
// perform 'peeling': as long as full vectors can be formed, the
// vectorized eval overload is called, followed by single-element
// processing for the leftovers.
// If the program was compiled with Vc (-DUSE_VC), we can observe
// that the third call actually processes Vc data during the peeling
// stage.

// Demo driver. Part one applies the demonstration functors to small
// 18-element arrays, so that the console output shows which eval
// overload handles which part of the data. Part two times sample_math
// and sample_math3 on large 3D arrays, once via vspline::apply and once
// in a plain scalar loop over the array's memory.
// Intervals are measured with std::chrono::steady_clock: this clock is
// monotonic, so an adjustment of the wall clock during a run can't
// distort the measurement. argc and argv are not used.

int main ( int argc , char * argv[] )
{
  typedef std::chrono::steady_clock stopwatch_t ;

  // helper yielding the whole milliseconds elapsed between two time points

  auto elapsed_ms = [] ( stopwatch_t::time_point from ,
                         stopwatch_t::time_point to )
  {
    return std::chrono::duration_cast<std::chrono::milliseconds>
             ( to - from ) . count() ;
  } ;

  vigra::MultiArray < 1 , test_t > sa ( 18 ) ;
  vigra::MultiArray < 1 , double > da ( 18 ) ;
  
  vspline::apply ( scalar_f() , sa ) ;
  std::cout << std::endl ;
  
  vspline::apply ( poly_f() , sa ) ;
  std::cout << std::endl ;
  
  vspline::apply ( double_f() , da ) ;
  std::cout << std::endl ;
  
  vspline::apply ( t_double_f() , da ) ;
  
#ifdef USE_VC
 
  // If we have an array with vector data, we can use 'apply' to
  // feed the vector data to a compatible unary_functor:
  
  vigra::MultiArray < 1 , Vc::float_v > fva ( 5 ) ;
  
  std::cout << std::endl ;
  
  vspline::apply ( float_v_f() , fva ) ;
  
#endif
  
  // For the speed test, we set up a 3D array holding
  // 256 * 1024 * 1024 floats.

  vigra::MultiArray < 3 , float >
         fa ( vigra::Shape3 ( 256 , 1024 , 1024 ) ) ;
  
  // let's use this value for testing:

  float a = 7.7f ;
  float b = 0.0f ;
  
  // applying 'functor' to a, we obtain the reference result in b
  
  std::cout << std::endl ;
  
  auto functor = sample_math() ;
  functor.eval ( a , b ) ;
  
  std::cout << "functor ( " << a << " ) = " << b << std::endl ;
  
  // we initialize 'fa' with a
  
  fa = a ;
  
  // and run the speed test
  
  std::cout << "speed test with vspline::apply" << std::endl ;
  
  auto start = stopwatch_t::now() ;

  vspline::apply ( functor , fa ) ;
  
  auto end = stopwatch_t::now() ;
  std::cout << "processing array 'fa' took "
            << elapsed_ms ( start , end )
            << " ms" << std::endl ;
  
  // we make sure we're not being fooled by taking a sample

  assert ( fa [ vigra::Shape3 ( 100 , 500 , 500 ) ] == b ) ;
  
  // as a reference, we use 'functor' in a scalar loop. Since the math
  // involved is trivial, the compiler can do very well autovectorizing
  // the operation, so this operation is fast and we can assume that
  // our efforts at vectorization were successful if the previous test
  // did take roughly the same time. It turns out that this depends
  // very much on the compiler used and the optimization level.
  
  std::cout << "speed test using scalar loop" << std::endl ;
  
  fa = a ;
  
  // the loop index has the same type as the value size() returns, so
  // it can hold every valid index and the loop condition compares
  // like with like

  typedef decltype ( fa.size() ) fa_index_t ;
  const fa_index_t fa_count = fa.size() ;
  float * pf = fa.data() ;

  start = stopwatch_t::now() ;

  for ( fa_index_t i = 0 ; i < fa_count ; ++i )
    functor.eval ( pf[i] , pf[i] ) ;
  
  end = stopwatch_t::now() ;
  std::cout << "processing array 'fa' took "
            << elapsed_ms ( start , end )
            << " ms" << std::endl ;

  // this actually fails with clang++ and -Ofast - presumably because
  // -Ofast permits value-changing floating point optimizations, so
  // that exact equality with 'b' is no longer guaranteed:

  assert ( fa [ vigra::Shape3 ( 100 , 500 , 500 ) ] == b ) ;
  
  // the same pair of tests with three-channel data. 'fa3' is not
  // explicitly initialized here.

  vigra::MultiArray < 3 , f3_t >
        fa3 ( vigra::Shape3 ( 1024 , 1024 , 256 ) ) ;

  // the functor is set up before the clock is started, so that only
  // the processing of the array is timed

  auto functor3 = sample_math3() ;

  start = stopwatch_t::now() ;
  
  vspline::apply ( functor3 , fa3 ) ;
  
  end = stopwatch_t::now() ;
  std::cout << "processing array 'fa3' took "
            << elapsed_ms ( start , end )
            << " ms" << std::endl ;

  typedef decltype ( fa3.size() ) fa3_index_t ;
  const fa3_index_t fa3_count = fa3.size() ;
  auto pf3 = fa3.data() ;

  start = stopwatch_t::now() ;

  for ( fa3_index_t i = 0 ; i < fa3_count ; ++i )
    functor3.eval ( pf3[i] , pf3[i] ) ;

  end = stopwatch_t::now() ;
  std::cout << "processing array 'fa3' with scalar loop took "
            << elapsed_ms ( start , end )
            << " ms" << std::endl ;

}
