Skip to content

Commit 9b22b9e

Browse files
committed
Added load_as / store_as / broadcast_as APIs
1 parent 7462f0a commit 9b22b9e

File tree

8 files changed

+188
-39
lines changed

8 files changed

+188
-39
lines changed

include/xsimd/arch/generic/xsimd_generic_memory.hpp

+16-20
Original file line numberDiff line numberDiff line change
@@ -70,10 +70,6 @@ namespace xsimd {
7070
batch<T_out, A> load_aligned(T_in const* mem, convert<T_out> cvt, requires_arch<generic>) {
7171
return detail::load_aligned<A>(mem, cvt, A{}, detail::conversion_type<A, T_in, T_out>{});
7272
}
73-
template<class A, class T>
74-
batch<std::complex<T>, A> load_aligned(std::complex<T> const* mem, convert<std::complex<T>>, requires_arch<generic>) {
75-
return batch<std::complex<T>, A>::load_aligned(mem);
76-
}
7773

7874
// load_unaligned
7975
namespace detail {
@@ -94,10 +90,6 @@ namespace xsimd {
9490
batch<T_out, A> load_unaligned(T_in const* mem, convert<T_out> cvt, requires_arch<generic>) {
9591
return detail::load_unaligned<A>(mem, cvt, generic{}, detail::conversion_type<A, T_in, T_out>{});
9692
}
97-
template<class A, class T>
98-
batch<std::complex<T>, A> load_unaligned(std::complex<T> const* mem, convert<std::complex<T>>, requires_arch<generic>) {
99-
return batch<std::complex<T>, A>::load_unaligned(mem);
100-
}
10193

10294
// store
10395
template<class T, class A>
@@ -147,39 +139,43 @@ namespace xsimd {
147139
}
148140

149141
// load_complex_aligned
150-
template <class A, class T> batch<std::complex<T>, A> load_complex_aligned(std::complex<T> const* mem, requires_arch<generic>) {
151-
using real_batch = batch<T, A>;
152-
T const *buffer = reinterpret_cast<T const *>(mem);
142+
template <class A, class T_out, class T_in>
143+
batch<std::complex<T_out>, A> load_complex_aligned(std::complex<T_in> const* mem, convert<std::complex<T_out>>, requires_arch<generic>) {
144+
using real_batch = batch<T_out, A>;
145+
T_in const* buffer = reinterpret_cast<T_in const*>(mem);
153146
real_batch hi = real_batch::load_aligned(buffer),
154147
lo = real_batch::load_aligned(buffer + real_batch::size);
155148
return detail::load_complex(hi, lo, A{});
156149
}
157150

158151
// load_complex_unaligned
159-
template <class A, class T> batch<std::complex<T>, A> load_complex_unaligned(std::complex<T> const* mem, requires_arch<generic>) {
160-
using real_batch = batch<T, A>;
161-
T const *buffer = reinterpret_cast<T const *>(mem);
152+
template <class A, class T_out, class T_in>
153+
batch<std::complex<T_out>, A> load_complex_unaligned(std::complex<T_in> const* mem, convert<std::complex<T_out>>, requires_arch<generic>) {
154+
using real_batch = batch<T_out, A>;
155+
T_in const* buffer = reinterpret_cast<T_in const*>(mem);
162156
real_batch hi = real_batch::load_unaligned(buffer),
163157
lo = real_batch::load_unaligned(buffer + real_batch::size);
164158
return detail::load_complex(hi, lo, A{});
165159
}
166160

167161
// store_complex_aligned
168-
template <class A, class T> void store_complex_aligned(std::complex<T>* dst, batch<std::complex<T>, A> const& src, requires_arch<generic>) {
169-
using real_batch = batch<T, A>;
162+
template <class A, class T_out, class T_in>
163+
void store_complex_aligned(std::complex<T_out>* dst, batch<std::complex<T_in>, A> const& src, requires_arch<generic>) {
164+
using real_batch = batch<T_in, A>;
170165
real_batch hi = detail::complex_high(src, A{});
171166
real_batch lo = detail::complex_low(src, A{});
172-
T * buffer = reinterpret_cast<T*>(dst);
167+
T_out* buffer = reinterpret_cast<T_out*>(dst);
173168
lo.store_aligned(buffer);
174169
hi.store_aligned(buffer + real_batch::size);
175170
}
176171

177172
// store_complex_unaligned
178-
template <class A, class T> void store_complex_unaligned(std::complex<T>* dst, batch<std::complex<T>, A> const& src, requires_arch<generic>) {
179-
using real_batch = batch<T, A>;
173+
template <class A, class T_out, class T_in>
174+
void store_complex_unaligned(std::complex<T_out>* dst, batch<std::complex<T_in>, A> const& src, requires_arch<generic>) {
175+
using real_batch = batch<T_in, A>;
180176
real_batch hi = detail::complex_high(src, A{});
181177
real_batch lo = detail::complex_low(src, A{});
182-
T * buffer = reinterpret_cast<T *>(dst);
178+
T_out* buffer = reinterpret_cast<T_out*>(dst);
183179
lo.store_unaligned(buffer);
184180
hi.store_unaligned(buffer + real_batch::size);
185181
}

include/xsimd/arch/xsimd_neon.hpp

+3-3
Original file line numberDiff line numberDiff line change
@@ -525,7 +525,7 @@ namespace xsimd
525525
****************/
526526

527527
template <class A>
528-
batch<std::complex<float>, A> load_complex_aligned(std::complex<float> const* mem, requires_arch<neon>)
528+
batch<std::complex<float>, A> load_complex_aligned(std::complex<float> const* mem, convert<std::complex<float>>, requires_arch<neon>)
529529
{
530530
using real_batch = batch<float, A>;
531531
const float* buf = reinterpret_cast<const float*>(mem);
@@ -536,9 +536,9 @@ namespace xsimd
536536
}
537537

538538
template <class A>
539-
batch<std::complex<float>, A> load_complex_unaligned(std::complex<float> const* mem, requires_arch<neon>)
539+
batch<std::complex<float>, A> load_complex_unaligned(std::complex<float> const* mem, convert<std::complex<float>> cvt, requires_arch<neon>)
540540
{
541-
return load_complex_aligned<A>(mem, A{});
541+
return load_complex_aligned<A>(mem, cvt, A{});
542542
}
543543

544544
/*****************

include/xsimd/arch/xsimd_neon64.hpp

+3-3
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ namespace xsimd
163163
****************/
164164

165165
template <class A>
166-
batch<std::complex<double>, A> load_complex_aligned(std::complex<double> const* mem, requires_arch<neon64>)
166+
batch<std::complex<double>, A> load_complex_aligned(std::complex<double> const* mem, convert<std::complex<double>>, requires_arch<neon64>)
167167
{
168168
using real_batch = batch<double, A>;
169169
const double* buf = reinterpret_cast<const double*>(mem);
@@ -174,9 +174,9 @@ namespace xsimd
174174
}
175175

176176
template <class A>
177-
batch<std::complex<double>, A> load_complex_unaligned(std::complex<double> const* mem, requires_arch<neon64>)
177+
batch<std::complex<double>, A> load_complex_unaligned(std::complex<double> const* mem, convert<std::complex<double>> cvt, requires_arch<neon64>)
178178
{
179-
return load_complex_aligned<A>(mem, A{});
179+
return load_complex_aligned<A>(mem, cvt, A{});
180180
}
181181

182182
/*****************

include/xsimd/types/xsimd_api.hpp

+149-8
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
#include <ostream>
1919

2020
#include "../types/xsimd_batch.hpp"
21+
#include "../types/xsimd_traits.hpp"
2122
#include "../arch/xsimd_isa.hpp"
2223

2324
namespace xsimd {
@@ -323,6 +324,23 @@ batch<T, A> broadcast(T v) {
323324
return kernel::broadcast<A>(v, A{});
324325
}
325326

327+
/**
328+
* @ingroup batch_data_transfer
329+
*
330+
* Creates a batch from the single value \c v and
331+
* the specified batch value type \c To.
332+
* @param v the value used to initialize the batch
333+
* @return a new batch instance
334+
*/
335+
template <class To, class A=default_arch, class From>
336+
simd_return_type<From, To> broadcast_as(From v) {
337+
using batch_value_type = typename simd_return_type<From, To>::value_type;
338+
using value_type = typename std::conditional<std::is_same<From, bool>::value,
339+
bool,
340+
batch_value_type>::type;
341+
return simd_return_type<From, To>(value_type(v));
342+
}
343+
326344
/**
327345
* @ingroup batch_math
328346
*
@@ -959,6 +977,58 @@ batch<From, A> load_unaligned(From const* ptr) {
959977
return kernel::load_unaligned<A>(ptr, kernel::convert<From>{}, A{});
960978
}
961979

980+
/**
981+
* @ingroup batch_data_transfer
982+
*
983+
* Creates a batch from the buffer \c ptr and the specifed
984+
* batch value type \c To. The memory needs to be aligned.
985+
* @param ptr the memory buffer to read
986+
* @return a new batch instance
987+
*/
988+
template <class To, class A=default_arch, class From>
989+
simd_return_type<From, To> load_as(From const* ptr, aligned_mode) {
990+
using batch_value_type = typename simd_return_type<From, To>::value_type;
991+
return kernel::load_aligned<A>(ptr, kernel::convert<batch_value_type>{}, A{});
992+
}
993+
994+
template <class To, class A = default_arch>
995+
simd_return_type<bool, To> load_as(bool const* ptr, aligned_mode) {
996+
return simd_return_type<bool, To>::load_aligned(ptr);
997+
}
998+
999+
template <class To, class A=default_arch, class From>
1000+
simd_return_type<std::complex<From>, To> load_as(std::complex<From> const* ptr, aligned_mode)
1001+
{
1002+
using batch_value_type = typename simd_return_type<std::complex<From>, To>::value_type;
1003+
return kernel::load_complex_aligned<A>(ptr, kernel::convert<batch_value_type>{}, A{});
1004+
}
1005+
1006+
/**
1007+
* @ingroup batch_data_transfer
1008+
*
1009+
* Creates a batch from the buffer \c ptr and the specifed
1010+
* batch value type \c To. The memory does not need to be aligned.
1011+
* @param ptr the memory buffer to read
1012+
* @return a new batch instance
1013+
*/
1014+
template <class To, class A=default_arch, class From>
1015+
simd_return_type<From, To> load_as(From const* ptr, unaligned_mode) {
1016+
using batch_value_type = typename simd_return_type<From, To>::value_type;
1017+
return kernel::load_unaligned<A>(ptr, kernel::convert<batch_value_type>{}, A{});
1018+
}
1019+
1020+
template <class To, class A = default_arch>
1021+
simd_return_type<bool, To> load_as(bool const* ptr, unaligned_mode) {
1022+
return simd_return_type<bool, To>::load_unaligned(ptr);
1023+
}
1024+
1025+
template <class To, class A=default_arch, class From>
1026+
simd_return_type<std::complex<From>, To> load_as(std::complex<From> const* ptr, unaligned_mode)
1027+
{
1028+
using batch_value_type = typename simd_return_type<std::complex<From>, To>::value_type;
1029+
return kernel::load_complex_unaligned<A>(ptr, kernel::convert<batch_value_type>{}, A{});
1030+
}
1031+
9621032
/**
9631033
* @ingroup batch_math
9641034
*
@@ -1423,8 +1493,8 @@ auto ssub(T const& x, Tp const& y) -> decltype(x - y) {
14231493
* @param mem the memory buffer to write to
14241494
* @param val the batch to copy from
14251495
*/
1426-
template<class To, class A, class From>
1427-
void store(From* mem, batch<To, A> const& val, aligned_mode={}) {
1496+
template<class A, class T>
1497+
void store(T* mem, batch<T, A> const& val, aligned_mode={}) {
14281498
return kernel::store_aligned<A>(mem, val, A{});
14291499
}
14301500

@@ -1436,8 +1506,8 @@ void store(From* mem, batch<To, A> const& val, aligned_mode={}) {
14361506
* @param mem the memory buffer to write to
14371507
* @param val the batch to copy from
14381508
*/
1439-
template<class To, class A, class From>
1440-
void store(To* mem, batch<From, A> const& val, unaligned_mode) {
1509+
template<class A, class T>
1510+
void store(T* mem, batch<T, A> const& val, unaligned_mode) {
14411511
return kernel::store_unaligned<A>(mem, val, A{});
14421512
}
14431513

@@ -1449,8 +1519,8 @@ void store(To* mem, batch<From, A> const& val, unaligned_mode) {
14491519
* @param mem the memory buffer to write to
14501520
* @param val the batch to copy from
14511521
*/
1452-
template<class To, class A, class From>
1453-
void store_aligned(To* mem, batch<From, A> const& val) {
1522+
template<class A, class T>
1523+
void store_aligned(T* mem, batch<T, A> const& val) {
14541524
return kernel::store_aligned<A>(mem, val, A{});
14551525
}
14561526

@@ -1462,11 +1532,82 @@ void store_aligned(To* mem, batch<From, A> const& val) {
14621532
* @param mem the memory buffer to write to
14631533
* @param val the batch to copy
14641534
*/
1465-
template<class To, class A, class From>
1466-
void store_unaligned(To* mem, batch<From, A> const& val) {
1535+
template<class A, class T>
1536+
void store_unaligned(T* mem, batch<T, A> const& val) {
14671537
return kernel::store_unaligned<A>(mem, val, A{});
14681538
}
14691539

1540+
/**
1541+
* @ingroup batch_data_transfer
1542+
*
1543+
* Copy content of batch \c src to the buffer \c dst. The
1544+
* memory needs to be aligned.
1545+
* @param dst the memory buffer to write to
1546+
* @param src the batch to copy
1547+
*/
1548+
template <class To, class A=default_arch, class From>
1549+
void store_as(To* dst, batch<From, A> const& src, aligned_mode) {
1550+
kernel::store_aligned(dst, src, A{});
1551+
}
1552+
1553+
template <class A=default_arch, class From>
1554+
void store_as(bool* dst, batch_bool<From, A> const& src, aligned_mode) {
1555+
kernel::store(src, dst, A{});
1556+
}
1557+
1558+
template <class To, class A=default_arch, class From>
1559+
void store_as(std::complex<To>* dst, batch<std::complex<From>,A> const& src, aligned_mode) {
1560+
kernel::store_complex_aligned(dst, src, A{});
1561+
}
1562+
1563+
/**
1564+
* @ingroup batch_data_transfer
1565+
*
1566+
* Copy content of batch \c src to the buffer \c dst. The
1567+
* memory does not need to be aligned.
1568+
* @param dst the memory buffer to write to
1569+
* @param src the batch to copy
1570+
*/
1571+
template <class To, class A=default_arch, class From>
1572+
void store_as(To* dst, batch<From, A> const& src, unaligned_mode) {
1573+
kernel::store_unaligned(dst, src, A{});
1574+
}
1575+
1576+
template <class A=default_arch, class From>
1577+
void store_as(bool* dst, batch_bool<From, A> const& src, unaligned_mode) {
1578+
kernel::store(src, dst, A{});
1579+
}
1580+
1581+
template <class To, class A=default_arch, class From>
1582+
void store_as(std::complex<To>* dst, batch<std::complex<From>, A> const& src, unaligned_mode) {
1583+
kernel::store_complex_unaligned(dst, src, A{});
1584+
}
1585+
1586+
/**
1587+
* @ingroup batch_data_transfer
1588+
*
1589+
* Copy content of batch of boolean \c src to the buffer \c dst. The
1590+
* memory needs to be aligned.
1591+
* @param dst the memory buffer to write to
1592+
* @param src the batch to copy
1593+
*/
1594+
template <class To, class A=default_arch, class From>
1595+
void store_batch(To* dst, batch_bool<From, A> const& src, aligned_mode) {
1596+
kernel::store(src, dst, A{});
1597+
}
1598+
1599+
/**
1600+
* @ingroup batch_data_transfer
1601+
*
1602+
* Copy content of batch of boolean \c src to the buffer \c dst. The
1603+
* memory does not need to be aligned.
1604+
* @param dst the memory buffer to write to
1605+
* @param src the batch to copy
1606+
*/
1607+
template <class To, class A=default_arch, class From>
1608+
void store_batch(To* dst, batch_bool<From, A> const& src, unaligned_mode) {
1609+
kernel::store(src, dst, A{});
1610+
}
14701611
/**
14711612
* @ingroup batch_arithmetic
14721613
*

include/xsimd/types/xsimd_batch.hpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -952,13 +952,13 @@ namespace xsimd
952952
template<class T, class A>
953953
batch<std::complex<T>, A> batch<std::complex<T>, A>::load_aligned(const value_type* src)
954954
{
955-
return kernel::load_complex_aligned<A>(src, A{});
955+
return kernel::load_complex_aligned<A>(src, kernel::convert<value_type>{}, A{});
956956
}
957957

958958
template<class T, class A>
959959
batch<std::complex<T>, A> batch<std::complex<T>, A>::load_unaligned(const value_type* src)
960960
{
961-
return kernel::load_complex_unaligned<A>(src, A{});
961+
return kernel::load_complex_unaligned<A>(src, kernel::convert<value_type>{}, A{});
962962
}
963963

964964
template<class T, class A>

include/xsimd/types/xsimd_traits.hpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515
#include <type_traits>
1616

17-
#include "xsimd_api.hpp"
17+
#include "xsimd_batch.hpp"
1818

1919
namespace xsimd
2020
{

test/test_api.cpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -123,10 +123,10 @@ class xsimd_api_test : public testing::Test
123123
batch_type b = batch_type::load(v.data(), xsimd::aligned_mode());
124124
V res(size);
125125

126-
xsimd::store(res.data(), b, xsimd::unaligned_mode());
126+
xsimd::store_as(res.data(), b, xsimd::unaligned_mode());
127127
EXPECT_VECTOR_EQ(res, v) << print_function_name(name + " unaligned");
128128

129-
xsimd::store(res.data(), b, xsimd::aligned_mode());
129+
xsimd::store_as(res.data(), b, xsimd::aligned_mode());
130130
EXPECT_VECTOR_EQ(res, v) << print_function_name(name + " aligned");
131131
}
132132

test/test_load_store.cpp

+12
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,12 @@ class load_store_test : public testing::Test
123123

124124
b = batch_type::load_aligned(v.data());
125125
EXPECT_BATCH_EQ(b, expected) << print_function_name(name + " aligned");
126+
127+
b = xsimd::load_as<value_type>(v.data(), xsimd::unaligned_mode());
128+
EXPECT_BATCH_EQ(b, expected) << print_function_name(name + " unaligned (load_as)");
129+
130+
b = xsimd::load_as<value_type>(v.data(), xsimd::aligned_mode());
131+
EXPECT_BATCH_EQ(b, expected) << print_function_name(name + " aligned (load_as)");
126132
}
127133

128134
template <class V>
@@ -136,6 +142,12 @@ class load_store_test : public testing::Test
136142

137143
b.store_aligned(res.data());
138144
EXPECT_VECTOR_EQ(res, v) << print_function_name(name + " aligned");
145+
146+
xsimd::store_as(res.data(), b, xsimd::unaligned_mode());
147+
EXPECT_VECTOR_EQ(res, v) << print_function_name(name + " unaligned (store_as)");
148+
149+
xsimd::store_as(res.data(), b, xsimd::aligned_mode());
150+
EXPECT_VECTOR_EQ(res, v) << print_function_name(name + " aligned (store_as)");
139151
}
140152

141153
template <class V>

0 commit comments

Comments
 (0)