00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00039 #ifndef MM_KERNEL_INNER_SSE2_A_H
00040 #define MM_KERNEL_INNER_SSE2_A_H
00041 #include "common.h"
00042 #include "vector_intrin.h"
00043
00062 template<typename T_real, typename T_reg, int T_M, int T_N, int T_K>
00063 class MM_kernel_inner_sse2_A {
00064 public:
00065 typedef T_real real;
00066 static int const M = T_M;
00067 static int const N = T_N;
00068 static int const K = T_K;
00069 protected:
00070 static int const floats_per_register = ( sizeof(T_reg) / sizeof(real) );
00073 private:
00075 template<int T_ROWS_kernel, int T_COLS_kernel, typename T_ordering_kernel, int T_repetitions>
00076 class Pack;
00077 public:
00078 typedef Pack< M, K, Ordering_col_wise, 1 > Pack_type_A;
00079 typedef Pack< K, N, Ordering_row_wise, floats_per_register > Pack_type_B;
00080 typedef Pack< M, N, Ordering_col_wise, 1 > Pack_type_C;
00084
00085
00086
00090 static void exec( real const * const * const A,
00091 real const * const * const B,
00092 real * const C,
00093 int const i = 1,
00094 int const offset_A = 0,
00095 int const offset_B = 0,
00096 int const offset_C = 0 );
00097
00098 template<int T_offset_A, int T_offset_B, int T_offset_C>
00099 static void exec( real const * const * const A,
00100 real const * const * const B,
00101 real * const C,
00102 int const i = 1 );
00103
00104
00105
00106 protected:
00107 template<int T_loop_index, int T_end>
00108 struct Loop {
00109 static inline void ALWAYS_INLINE set_to_zero( Vector_intrin<real, T_reg> * X_reg ) {
00110 X_reg[T_loop_index].set_to_zero();
00111 Loop<T_loop_index+1, T_end>::set_to_zero( X_reg );
00112 }
00113 static inline void ALWAYS_INLINE inner( int const row_A_reg,
00114 int const row_B,
00115 Vector_intrin<real, T_reg> const & A_reg,
00116 Vector_intrin<real, T_reg> * C_reg,
00117 real const * B_packed ) {
00118 Vector_intrin<real, T_reg> B_reg;
00119 B_reg.load_p( &B_packed[row_B * T_N * floats_per_register +
00120 T_loop_index * floats_per_register] );
00121 B_reg *= A_reg;
00122 C_reg[row_A_reg + T_loop_index * T_M / floats_per_register] += B_reg;
00123 Loop<T_loop_index+1, T_end>::inner( row_A_reg, row_B,
00124 A_reg, C_reg,
00125 B_packed );
00126 }
00127 static inline void ALWAYS_INLINE middle( int const col_A,
00128 Vector_intrin<real, T_reg> * C_reg,
00129 real const * A,
00130 real const * B_packed ) {
00131 Vector_intrin<real, T_reg> A_reg;
00132 A_reg.load_p( &A[col_A * T_M + T_loop_index * floats_per_register] );
00133
00134 Loop<0, T_N>::inner( T_loop_index,
00135 col_A,
00136 A_reg,
00137 C_reg,
00138 B_packed );
00139 Loop<T_loop_index+1, T_end>::middle( col_A, C_reg, A, B_packed );
00140 }
00141 static inline void ALWAYS_INLINE outer( int const start_i,
00142 Vector_intrin<real, T_reg> * C_reg,
00143 real const * A,
00144 real const * B_packed ) {
00145
00146 Loop<0, T_M/floats_per_register>::middle( start_i + T_loop_index,
00147 C_reg,
00148 A,
00149 B_packed );
00150 Loop<T_loop_index+1, T_end>::outer( start_i, C_reg, A, B_packed );
00151 }
00152 static inline void ALWAYS_INLINE add( Vector_intrin<real, T_reg> * X_reg,
00153 real const * X ) {
00154 X_reg[T_loop_index] += &X[T_loop_index * floats_per_register];
00155 Loop<T_loop_index+1, T_end>::add( X_reg, X );
00156 }
00157 static inline void ALWAYS_INLINE store( Vector_intrin<real, T_reg> const * X_reg,
00158 real * X ) {
00159 X_reg[T_loop_index].store_p( &X[T_loop_index * floats_per_register] );
00160 Loop<T_loop_index+1, T_end>::store( X_reg, X );
00161 }
00162
00163 static inline void ALWAYS_INLINE multiple_loop( Vector_intrin<real, T_reg> * C_reg,
00164 real const * const * const A,
00165 real const * const * const B ) {
00166
00167 Loop<0, T_K>::outer( 0, C_reg, A[T_loop_index], B[T_loop_index] );
00168 Loop<T_loop_index+1, T_end>::multiple_loop( C_reg, A, B );
00169 }
00170 };
00171
00172 template<int T_end>
00173 struct Loop<T_end, T_end> {
00174 static inline void ALWAYS_INLINE set_to_zero( Vector_intrin<real, T_reg> * X_reg ) {}
00175 static inline void ALWAYS_INLINE inner( int const row_A_reg,
00176 int const row_B,
00177 Vector_intrin<real, T_reg> const & A_reg,
00178 Vector_intrin<real, T_reg> * C_reg,
00179 real const * B_packed ) {}
00180 static inline void ALWAYS_INLINE middle( int const col_A,
00181 Vector_intrin<real, T_reg> * C_reg,
00182 real const * A,
00183 real const * B_packed ) {}
00184 static inline void ALWAYS_INLINE outer( int const start_i,
00185 Vector_intrin<real, T_reg> * C_reg,
00186 real const * A,
00187 real const * B_packed ) {}
00188 static inline void ALWAYS_INLINE add( Vector_intrin<real, T_reg> * X_reg,
00189 real const * X ) {}
00190 static inline void ALWAYS_INLINE store( Vector_intrin<real, T_reg> const * X_reg,
00191 real * X ) {}
00192 static inline void ALWAYS_INLINE multiple_loop( Vector_intrin<real, T_reg> * C_reg,
00193 real const * const * const A,
00194 real const * const * const B ) {}
00195 };
00196 };
00197
00198
00199
00200
00201 template<typename real, typename T_reg, int T_M, int T_N, int T_K>
00202 void MM_kernel_inner_sse2_A<real, T_reg, T_M, T_N, T_K>::exec(real const * const * const A,
00203 real const * const * const B,
00204 real * const C,
00205 int const i,
00206 int const offset_A,
00207 int const offset_B,
00208 int const offset_C) {
00209 STATIC_ASSERT_DEBUG(!(T_M%floats_per_register), TEMPLATE_ARGUMENT_T_M_MUST_BE_MULTIPLE_OF_floats_per_register);
00210 Vector_intrin<real, T_reg> C_reg[T_M * T_N / floats_per_register];
00211 MM_kernel_inner_sse2_A<real, T_reg, T_M, T_N, T_K>::template Loop<0, T_M * T_N / floats_per_register>::set_to_zero( C_reg );
00212 #if 1 // I loose a bit performance because of the offsets
00213 for (int ind = 0; ind < i; ++ind)
00214 MM_kernel_inner_sse2_A<real, T_reg, T_M, T_N, T_K>::template Loop<0, T_K>::outer( 0, C_reg, A[ind] + offset_A, B[ind] + offset_B );
00216
00217 MM_kernel_inner_sse2_A<real, T_reg, T_M, T_N, T_K>::template Loop<0, T_M * T_N / floats_per_register>::add( C_reg, C + offset_C);
00218 MM_kernel_inner_sse2_A<real, T_reg, T_M, T_N, T_K>::template Loop<0, T_M * T_N / floats_per_register>::store( C_reg, C + offset_C);
00219 #else
00220 for (int ind = 0; ind < i; ++ind)
00221 MM_kernel_inner_sse2_A<real, T_reg, T_M, T_N, T_K>::template Loop<0, T_K>::outer( 0, C_reg, A[ind], B[ind] );
00223
00224 MM_kernel_inner_sse2_A<real, T_reg, T_M, T_N, T_K>::template Loop<0, T_M * T_N / floats_per_register>::add( C_reg, C);
00225 MM_kernel_inner_sse2_A<real, T_reg, T_M, T_N, T_K>::template Loop<0, T_M * T_N / floats_per_register>::store( C_reg, C);
00226 #endif
00227 }
00228
00229 template<typename real, typename T_reg, int T_M, int T_N, int T_K>
00230 template<int T_offset_A, int T_offset_B, int T_offset_C>
00231 void MM_kernel_inner_sse2_A<real, T_reg, T_M, T_N, T_K>::exec( real const * const * const A,
00232 real const * const * const B,
00233 real * const C,
00234 int const i ) {
00235 STATIC_ASSERT_DEBUG(!(T_M%floats_per_register), TEMPLATE_ARGUMENT_T_M_MUST_BE_MULTIPLE_OF_floats_per_register);
00236 Vector_intrin<real, T_reg> C_reg[T_M * T_N / floats_per_register];
00237 MM_kernel_inner_sse2_A<real, T_reg, T_M, T_N, T_K>::template Loop<0, T_M * T_N / floats_per_register>::set_to_zero( C_reg );
00238 for (int ind = 0; ind < i; ++ind)
00239 MM_kernel_inner_sse2_A<real, T_reg, T_M, T_N, T_K>::template Loop<0, T_K>::outer( 0, C_reg, A[ind] + T_offset_A, B[ind] + T_offset_B );
00241
00242 MM_kernel_inner_sse2_A<real, T_reg, T_M, T_N, T_K>::template Loop<0, T_M * T_N / floats_per_register>::add( C_reg, C + T_offset_C);
00243 MM_kernel_inner_sse2_A<real, T_reg, T_M, T_N, T_K>::template Loop<0, T_M * T_N / floats_per_register>::store( C_reg, C + T_offset_C);
00244 }
00245
00246
00247
00248
00249
00250
00252 template<typename real, typename T_reg, int T_M, int T_N, int T_K>
00253 template<int T_rows, int T_cols, typename T_ordering_kernel, int T_repetitions>
00254 class MM_kernel_inner_sse2_A<real, T_reg, T_M, T_N, T_K>::Pack {
00255 public:
00256 static int const size_packed = T_rows * T_cols * T_repetitions;
00257 static int const rows = T_rows;
00258 static int const cols = T_cols;
00259
00260 template<typename T_ordering_matrix>
00261 struct Assign_to_packed {
00262 typedef real * PtrTypePacked;
00263 typedef real const * const PtrType;
00264 inline static void exec( PtrType X, PtrTypePacked X_packed,
00265 int const row_k,
00266 int const col_k,
00267 int const rows_total_matrix,
00268 int const cols_total_matrix ) {
00269 for ( int ir = 0; ir < T_repetitions; ++ir)
00270 X_packed[ T_ordering_kernel::get( row_k, col_k, T_rows, T_cols ) * T_repetitions + ir ]
00271 = X[ T_ordering_matrix::get(row_k, col_k, rows_total_matrix, cols_total_matrix) ];
00272 }
00273 };
00274
00275 template<typename T_ordering_matrix>
00276 struct Extract_from_packed {
00277 typedef real const * const PtrTypePacked;
00278 typedef real * PtrType;
00279 inline static void exec( PtrType X, PtrTypePacked X_packed,
00280 int const row_k,
00281 int const col_k,
00282 int const rows_total_matrix,
00283 int const cols_total_matrix ) {
00284 for ( int ir = 0; ir < T_repetitions; ++ir)
00285 X[ T_ordering_matrix::get(row_k, col_k, rows_total_matrix, cols_total_matrix) ] =
00286 X_packed[ T_ordering_kernel::get( row_k, col_k, T_rows, T_cols ) * T_repetitions + ir ];
00287 }
00288 };
00289
00290 template<template<typename T_ordering> class T_assign, typename T_ordering_matrix>
00291 static void exec(typename T_assign<T_ordering_matrix>::PtrType X,
00292 typename T_assign<T_ordering_matrix>::PtrTypePacked X_packed,
00293 int const rows_total_matrix, int const cols_total_matrix) {
00294
00295 for ( int col_k = 0; col_k < T_cols; ++col_k ) {
00296
00297 for ( int row_k = 0; row_k < T_rows; ++row_k ) {
00298 T_assign<T_ordering_matrix>::exec( X, X_packed,
00299 row_k, col_k,
00300 rows_total_matrix, cols_total_matrix );
00301 }
00302 }
00303 }
00304
00309 template<typename T_ordering_matrix>
00310 inline static void pack(real const * const X, real * X_packed,
00311 int const rows_total_matrix, int const cols_total_matrix) {
00312 exec< Assign_to_packed, T_ordering_matrix >(X, X_packed, rows_total_matrix, cols_total_matrix);
00313 }
00318 template<typename T_ordering_matrix>
00319 inline static void unpack(real * X, real const * const X_packed,
00320 int const rows_total_matrix, int const cols_total_matrix) {
00321 exec< Extract_from_packed, T_ordering_matrix >(X, X_packed, rows_total_matrix, cols_total_matrix);
00322 }
00323 };
00324 #endif