|
|
@@ -8,102 +8,318 @@
|
|
|
#include "vector_loader.h" |
|
|
|
#include "const.h" |
|
|
|
|
|
|
|
/**
 * @brief Super class for all aggregation functions. Guards sub classes from having a non-integral base type.
 *
 * @tparam T
 */
template <typename T>
class AggFunction {
    static_assert(std::is_integral<T>::value, "The base type of an AggFunction must be an integral type");
};
|
|
|
|
|
|
|
/**
 * @brief Template class that implements methods used for summation. It wraps the corresponding vector intrinsics.
 *
 * @tparam T base datatype for the implemented methods
 */
template<typename T>
class Sum : public AggFunction<T> {
public:
    static inline __m512i simd_agg(__m512i aggregator, __m512i vector) {
        if constexpr (sizeof(T) == 4) return _mm512_add_epi32(aggregator, vector);
        else if constexpr (sizeof(T) == 8) return _mm512_add_epi64(aggregator, vector);
        static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Sum is only implemented for 32 and 64 bit wide integers");
    }

    static inline __m512i simd_agg(__m512i aggregator, __mmask16 mask, __m512i vector) {
        if constexpr (sizeof(T) == 4) return _mm512_mask_add_epi32(aggregator, mask, aggregator, vector);
        else if constexpr (sizeof(T) == 8) return _mm512_mask_add_epi64(aggregator, mask, aggregator, vector);
        static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Sum is only implemented for 32 and 64 bit wide integers");
    }

    static inline T simd_reduce(__m512i vector) {
        if constexpr (sizeof(T) == 4) return _mm512_reduce_add_epi32(vector);
        else if constexpr (sizeof(T) == 8) return _mm512_reduce_add_epi64(vector);
        static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Sum is only implemented for 32 and 64 bit wide integers");
    }

    static inline T scalar_agg(T aggregator, T scalar) { return aggregator + scalar; }

    // the neutral element of addition
    static inline __m512i zero() { return _mm512_set1_epi32(0); }
};
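
// Usage sketch (illustrative, not part of the original interface): how the
// Sum building blocks compose for a single vector of eight uint64_t values.
// Assumes an AVX-512 capable CPU and a 64-byte aligned buffer.
//
//   alignas(64) uint64_t buf[8] = {1, 2, 3, 4, 5, 6, 7, 8};
//   __m512i acc = Sum<uint64_t>::simd_agg(Sum<uint64_t>::zero(), _mm512_load_si512(buf));
//   uint64_t total = Sum<uint64_t>::simd_reduce(acc);   // total == 36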
|
|
|
|
|
|
|
/**
 * @brief Template class that implements methods used for maximum determination. It wraps the corresponding vector intrinsics.
 *
 * @tparam T base datatype for the implemented methods
 */
template<typename T>
class Max : public AggFunction<T> {
public:
    static inline __m512i simd_agg(__m512i aggregator, __m512i vector) {
        // unsigned types need the unsigned comparison intrinsics, otherwise
        // values above the signed maximum would compare as negative
        if constexpr (sizeof(T) == 4) {
            if constexpr (std::is_signed<T>::value) return _mm512_max_epi32(aggregator, vector);
            else return _mm512_max_epu32(aggregator, vector);
        }
        else if constexpr (sizeof(T) == 8) {
            if constexpr (std::is_signed<T>::value) return _mm512_max_epi64(aggregator, vector);
            else return _mm512_max_epu64(aggregator, vector);
        }
        static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Max is only implemented for 32 and 64 bit wide integers");
    }

    static inline __m512i simd_agg(__m512i aggregator, __mmask16 mask, __m512i vector) {
        if constexpr (sizeof(T) == 4) {
            if constexpr (std::is_signed<T>::value) return _mm512_mask_max_epi32(aggregator, mask, aggregator, vector);
            else return _mm512_mask_max_epu32(aggregator, mask, aggregator, vector);
        }
        else if constexpr (sizeof(T) == 8) {
            if constexpr (std::is_signed<T>::value) return _mm512_mask_max_epi64(aggregator, mask, aggregator, vector);
            else return _mm512_mask_max_epu64(aggregator, mask, aggregator, vector);
        }
        static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Max is only implemented for 32 and 64 bit wide integers");
    }

    static inline T simd_reduce(__m512i vector) {
        if constexpr (sizeof(T) == 4) {
            if constexpr (std::is_signed<T>::value) return _mm512_reduce_max_epi32(vector);
            else return _mm512_reduce_max_epu32(vector);
        }
        else if constexpr (sizeof(T) == 8) {
            if constexpr (std::is_signed<T>::value) return _mm512_reduce_max_epi64(vector);
            else return _mm512_reduce_max_epu64(vector);
        }
        static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Max is only implemented for 32 and 64 bit wide integers");
    }

    static inline T scalar_agg(T aggregator, T scalar) { return std::max(aggregator, scalar); }

    // the neutral element of max is the smallest value the type can hold
    static inline __m512i zero() {
        if constexpr (sizeof(T) == 4) {
            if constexpr (std::is_signed<T>::value) return _mm512_set1_epi32(INT32_MIN);
            else return _mm512_set1_epi32(0);
        }
        else if constexpr (sizeof(T) == 8) {
            if constexpr (std::is_signed<T>::value) return _mm512_set1_epi64(INT64_MIN);
            else return _mm512_set1_epi64(0);
        }
        static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Max is only implemented for 32 and 64 bit wide integers");
    }
};
|
|
|
|
|
|
|
/**
 * @brief Template class that implements methods used for minimum determination. It wraps the corresponding vector intrinsics.
 *
 * @tparam T base datatype for the implemented methods
 */
template<typename T>
class Min : public AggFunction<T> {
public:
    static inline __m512i simd_agg(__m512i aggregator, __m512i vector) {
        // unsigned types need the unsigned comparison intrinsics, otherwise
        // values above the signed maximum would compare as negative
        if constexpr (sizeof(T) == 4) {
            if constexpr (std::is_signed<T>::value) return _mm512_min_epi32(aggregator, vector);
            else return _mm512_min_epu32(aggregator, vector);
        }
        else if constexpr (sizeof(T) == 8) {
            if constexpr (std::is_signed<T>::value) return _mm512_min_epi64(aggregator, vector);
            else return _mm512_min_epu64(aggregator, vector);
        }
        static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Min is only implemented for 32 and 64 bit wide integers");
    }

    static inline __m512i simd_agg(__m512i aggregator, __mmask16 mask, __m512i vector) {
        if constexpr (sizeof(T) == 4) {
            if constexpr (std::is_signed<T>::value) return _mm512_mask_min_epi32(aggregator, mask, aggregator, vector);
            else return _mm512_mask_min_epu32(aggregator, mask, aggregator, vector);
        }
        else if constexpr (sizeof(T) == 8) {
            if constexpr (std::is_signed<T>::value) return _mm512_mask_min_epi64(aggregator, mask, aggregator, vector);
            else return _mm512_mask_min_epu64(aggregator, mask, aggregator, vector);
        }
        static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Min is only implemented for 32 and 64 bit wide integers");
    }

    static inline T simd_reduce(__m512i vector) {
        if constexpr (sizeof(T) == 4) {
            if constexpr (std::is_signed<T>::value) return _mm512_reduce_min_epi32(vector);
            else return _mm512_reduce_min_epu32(vector);
        }
        else if constexpr (sizeof(T) == 8) {
            if constexpr (std::is_signed<T>::value) return _mm512_reduce_min_epi64(vector);
            else return _mm512_reduce_min_epu64(vector);
        }
        static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Min is only implemented for 32 and 64 bit wide integers");
    }

    static inline T scalar_agg(T aggregator, T scalar) { return std::min(aggregator, scalar); }

    // the neutral element of min is the largest value the type can hold
    static inline __m512i zero() {
        if constexpr (sizeof(T) == 4) {
            if constexpr (std::is_signed<T>::value) return _mm512_set1_epi32(INT32_MAX);
            else return _mm512_set1_epi32(-1);   // all bits set == UINT32_MAX per lane
        }
        else if constexpr (sizeof(T) == 8) {
            if constexpr (std::is_signed<T>::value) return _mm512_set1_epi64(INT64_MAX);
            else return _mm512_set1_epi64(-1);   // all bits set == UINT64_MAX per lane
        }
        static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Min is only implemented for 32 and 64 bit wide integers");
    }
};
|
|
|
|
|
|
|
/**
 * @brief Template class that implements an aggregation operation.
 *
 * @tparam base_t Base type of the values for aggregation
 * @tparam func Aggregation function to apply, e.g. Sum, Max or Min
 * @tparam load_mode Load mode used by the Vector_Loader to fetch vectors from memory
 */
template<typename base_t, template<typename _base_t> class func, load_mode load_mode>
class Aggregation {
public:

    static_assert(std::is_same_v<base_t, uint64_t>, "Enforce unsigned 64 bit ints.");

    using OP = func<base_t>;

    /**
     * @brief Calculates the maximal memory needed to store a chunk's processing result.
     *
     * @param chunk_size_b Size of the chunk in bytes
     * @return size_t Size of the chunk's processing result in bytes
     */
    static size_t result_bytes_per_chunk(size_t chunk_size_b) {
        // an aggregation returns a single value of type base_t
        return sizeof(base_t);
    }
|
|
|
|
|
|
|
    /**
     * @brief Applies the aggregation function on the chunk starting at *src* and spanning *chunk_size_b* bytes.
     * The result is written to main memory.
     *
     * @param dest Pointer to the start of the result chunk
     * @param src Pointer to the start of the source chunk
     * @param chunk_size_b Size of the source chunk in bytes
     * @return true When the aggregation is done
     * @return false Never
     */
    static bool apply(base_t *dest, base_t *src, size_t chunk_size_b) {
        constexpr size_t lanes = VECTOR_SIZE<base_t>();
        size_t value_count = chunk_size_b / sizeof(base_t);
        __m512i agg_vec = func<base_t>::zero();
        size_t i = 0;

        // vectorized main loop; stop before running out of space. value_count is
        // unsigned, so value_count - lanes would wrap around if value_count < lanes.
        if(value_count >= lanes) {
            for(; i <= value_count - lanes; i += lanes) {
                __m512i vec = Vector_Loader<base_t, load_mode>::load(src + i);

                agg_vec = func<base_t>::simd_agg(agg_vec, vec);
            }
        }
        // reducing the (possibly untouched) accumulator also yields the correct
        // neutral element when the chunk is smaller than one vector
        base_t result = func<base_t>::simd_reduce(agg_vec);

        // scalar tail for the remaining values
        for(; i < value_count; ++i) {
            result = func<base_t>::scalar_agg(result, src[i]);
        }
        *dest = result;

        return true;
    }
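
    // Usage sketch (illustrative; the chunk contents and the load_mode
    // enumerator are hypothetical -- substitute whichever enumerator
    // vector_loader.h actually declares):
    //
    //   alignas(64) uint64_t chunk[1024] = { /* ... */ };
    //   uint64_t sum;
    //   Aggregation<uint64_t, Sum, load_mode::Aligned>::apply(&sum, chunk, sizeof(chunk));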
|
|
|
|
|
|
|
    /**
     * @brief Applies the aggregation function on the chunk starting at *src* and spanning *chunk_size_b* bytes,
     * while applying the bit string stored in *msks*. The result is written to main memory.
     *
     * @param dest Pointer to the start of the result chunk
     * @param src Pointer to the start of the source chunk
     * @param msks Pointer to the bitstring that marks the values that should be aggregated
     * @param chunk_size_b Size of the source chunk in bytes
     * @return true When the aggregation is done
     * @return false Never
     */
    static bool apply_masked(base_t *dest, base_t *src, uint16_t* msks, size_t chunk_size_b) {
        constexpr size_t lanes = VECTOR_SIZE<base_t>();
        uint8_t* masks = reinterpret_cast<uint8_t*>(msks);
        size_t value_count = chunk_size_b / sizeof(base_t);
        __m512i agg_vec = func<base_t>::zero();
        size_t i = 0;

        // vectorized main loop; stop before running out of space. value_count is
        // unsigned, so value_count - lanes would wrap around if value_count < lanes.
        if(value_count >= lanes) {
            for(; i <= value_count - lanes; i += lanes) {
                __m512i vec = Vector_Loader<base_t, load_mode>::load(src + i);
                __mmask8 mask = _mm512_int2mask(masks[i / lanes]);

                agg_vec = func<base_t>::simd_agg(agg_vec, mask, vec);
            }
        }
        *dest = func<base_t>::simd_reduce(agg_vec);

        // scalar tail for the remaining values
        for(; i < value_count; ++i) {
            uint8_t mask = masks[i / lanes];
            if(mask & (0b1 << (i % lanes))) {
                *dest = func<base_t>::scalar_agg(*dest, src[i]);
            }
        }

        return true;
    }
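
    // Mask layout as consumed above (illustrative): with uint64_t values there
    // are 8 lanes per __m512i, so each byte of *msks* selects the values of one
    // vector, least significant bit first (assuming a little-endian machine).
    //
    //   alignas(64) uint64_t chunk[16] = { /* ... */ };
    //   uint16_t masks = 0b0000'0001'0000'0001;   // keep chunk[0] and chunk[8]
    //   uint64_t sum;
    //   Aggregation<uint64_t, Sum, load_mode::Aligned>::apply_masked(&sum, chunk, &masks, sizeof(chunk));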
|
|
|
|
|
|
|
    /**
     * @brief Applies the aggregation function on the chunk starting at *src* and spanning *chunk_size_b* bytes,
     * while applying the bit string stored in *msks*. The values are aggregated into the register *dest*
     * without clearing it beforehand.
     *
     * NOTE! This function only works correctly if chunk_size_b is a multiple of 64 byte.
     *
     * @param dest Vector register used for storing and passing the result around
     * @param src Pointer to the start of the source chunk
     * @param msks Pointer to the bitstring that marks the values that should be aggregated
     * @param chunk_size_b Size of the source chunk in bytes
     * @param time_load Out parameter; receives the time in ns spent loading vectors
     * @return __m512i Vector register holding the aggregation result
     */
    static __m512i apply_masked(__m512i dest, base_t *src, uint16_t* msks, size_t chunk_size_b, uint64_t* time_load) {
        constexpr size_t lanes = VECTOR_SIZE<base_t>();
        uint8_t* masks = reinterpret_cast<uint8_t*>(msks);
        //TODO this function does not work if value_count % lanes != 0
        size_t value_count = chunk_size_b / sizeof(base_t);
        size_t i = 0;

        *time_load = 0;

        // stop before running out of space; value_count is unsigned, so
        // value_count - lanes would wrap around if value_count < lanes
        if(value_count >= lanes) {
            for(; i <= value_count - lanes; i += lanes) {
                const auto ts = std::chrono::steady_clock::now();

                __m512i vec = Vector_Loader<base_t, load_mode>::load(src + i);

                const auto te = std::chrono::steady_clock::now();
                *time_load += std::chrono::duration_cast<std::chrono::nanoseconds>(te - ts).count();

                __mmask8 mask = _mm512_int2mask(masks[i / lanes]);
                dest = func<base_t>::simd_agg(dest, mask, vec);
            }
        }

        return dest;
    }
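
    // Usage sketch for the register-accumulating variant (illustrative; chunk
    // pointers, masks and the load_mode enumerator are hypothetical):
    //
    //   using Agg = Aggregation<uint64_t, Sum, load_mode::Aligned>;
    //   __m512i acc = Agg::get_zero();
    //   uint64_t load_ns;
    //   acc = Agg::apply_masked(acc, chunk_a, masks_a, chunk_size_b, &load_ns);
    //   acc = Agg::apply_masked(acc, chunk_b, masks_b, chunk_size_b, &load_ns);  // resets load_ns
    //   uint64_t total;
    //   Agg::happly(&total, acc);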
|
|
|
|
|
|
|
    /**
     * @brief Applies the aggregation function on the chunk starting at *src* and spanning *chunk_size_b* bytes,
     * while applying two bit strings stored in *msks0* and *msks1*. The values are aggregated into the register
     * *dest* without clearing it beforehand.
     *
     * NOTE! This function only works correctly if chunk_size_b is a multiple of 64 byte.
     *
     * @param dest Vector register used for storing and passing the result around
     * @param src Pointer to the start of the source chunk
     * @param msks0 Pointer to the first bitstring that marks the values that should be aggregated
     * @param msks1 Pointer to the second bitstring that marks the values that should be aggregated
     * @param chunk_size_b Size of the source chunk in bytes
     * @return __m512i Vector register holding the aggregation result
     */
    static __m512i apply_masked(__m512i dest, base_t *src, uint16_t* msks0, uint16_t* msks1, size_t chunk_size_b) {
        constexpr size_t lanes = VECTOR_SIZE<base_t>();
        uint8_t* masks0 = reinterpret_cast<uint8_t*>(msks0);
        uint8_t* masks1 = reinterpret_cast<uint8_t*>(msks1);
        //TODO this function does not work if value_count % lanes != 0
        size_t value_count = chunk_size_b / sizeof(base_t);
        size_t i = 0;

        // stop before running out of space; value_count is unsigned, so
        // value_count - lanes would wrap around if value_count < lanes
        if(value_count >= lanes) {
            for(; i <= value_count - lanes; i += lanes) {
                __m512i vec = Vector_Loader<base_t, load_mode>::load(src + i);
                __mmask8 mask0 = _mm512_int2mask(masks0[i / lanes]);
                __mmask8 mask1 = _mm512_int2mask(masks1[i / lanes]);

                // only values selected by both bit strings take part in the aggregation
                mask0 = _kand_mask8(mask0, mask1);
                dest = func<base_t>::simd_agg(dest, mask0, vec);
            }
        }

        return dest;
    }
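
    // Mask combination performed above (illustrative): only bits set in BOTH
    // bit strings select a value, e.g. for one 8-lane byte:
    //
    //   msks0 byte:  0b1100'1010
    //   msks1 byte:  0b0100'1110
    //   combined  :  0b0100'1010   // lanes 1, 3 and 6 are aggregated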
|
|
|
|
|
|
|
    /**
     * @brief Reduces a vector by applying the aggregation function horizontally.
     *
     * @param dest Result of the horizontal aggregation
     * @param src Vector as source for the horizontal aggregation
     * @return true When the operation is done
     * @return false Never
     */
    static bool happly(base_t *dest, __m512i src) {
        *dest = func<base_t>::simd_reduce(src);

        return true;
    }
|
|
|
|
|
|
|
    static __m512i get_zero() {
        return func<base_t>::zero();
    }
};