1 /* blas/source_her2k_c.h
3 * Copyright (C) 2001, 2007 Brian Gough
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 3 of the License, or (at
8 * your option) any later version.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
24 const BASE alpha_real = CONST_REAL0(alpha);
25 BASE alpha_imag = CONST_IMAG0(alpha);
27 if (beta == 1.0 && ((alpha_real == 0.0 && alpha_imag == 0.0) || K == 0))
30 if (Order == CblasRowMajor) {
34 uplo = (Uplo == CblasUpper) ? CblasLower : CblasUpper;
35 trans = (Trans == CblasNoTrans) ? CblasConjTrans : CblasNoTrans;
36 alpha_imag *= -1; /* conjugate alpha */
39 /* form C := beta*C */
42 if (uplo == CblasUpper) {
43 for (i = 0; i < N; i++) {
44 for (j = i; j < N; j++) {
45 REAL(C, ldc * i + j) = 0.0;
46 IMAG(C, ldc * i + j) = 0.0;
50 for (i = 0; i < N; i++) {
51 for (j = 0; j <= i; j++) {
52 REAL(C, ldc * i + j) = 0.0;
53 IMAG(C, ldc * i + j) = 0.0;
57 } else if (beta != 1.0) {
58 if (uplo == CblasUpper) {
59 for (i = 0; i < N; i++) {
60 REAL(C, ldc * i + i) *= beta;
61 IMAG(C, ldc * i + i) = 0.0;
62 for (j = i + 1; j < N; j++) {
63 REAL(C, ldc * i + j) *= beta;
64 IMAG(C, ldc * i + j) *= beta;
68 for (i = 0; i < N; i++) {
69 for (j = 0; j < i; j++) {
70 REAL(C, ldc * i + j) *= beta;
71 IMAG(C, ldc * i + j) *= beta;
73 REAL(C, ldc * i + i) *= beta;
74 IMAG(C, ldc * i + i) = 0.0;
78 for (i = 0; i < N; i++) {
79 IMAG(C, ldc * i + i) = 0.0;
83 if (alpha_real == 0.0 && alpha_imag == 0.0)
86 if (uplo == CblasUpper && trans == CblasNoTrans) {
88 for (i = 0; i < N; i++) {
90 /* Cii += alpha Aik conj(Bik) + conj(alpha) Bik conj(Aik) */
93 /* BASE temp_imag = 0.0; */
94 for (k = 0; k < K; k++) {
95 const BASE Aik_real = CONST_REAL(A, i * lda + k);
96 const BASE Aik_imag = CONST_IMAG(A, i * lda + k);
97 /* temp1 = alpha * Aik */
98 const BASE temp1_real = alpha_real * Aik_real - alpha_imag * Aik_imag;
99 const BASE temp1_imag = alpha_real * Aik_imag + alpha_imag * Aik_real;
100 const BASE Bik_real = CONST_REAL(B, i * ldb + k);
101 const BASE Bik_imag = CONST_IMAG(B, i * ldb + k);
102 temp_real += temp1_real * Bik_real + temp1_imag * Bik_imag;
105 REAL(C, i * ldc + i) += 2 * temp_real;
106 IMAG(C, i * ldc + i) = 0.0;
109 /* Cij += alpha Aik conj(Bjk) + conj(alpha) Bik conj(Ajk) */
110 for (j = i + 1; j < N; j++) {
111 BASE temp_real = 0.0;
112 BASE temp_imag = 0.0;
113 for (k = 0; k < K; k++) {
114 const BASE Aik_real = CONST_REAL(A, i * lda + k);
115 const BASE Aik_imag = CONST_IMAG(A, i * lda + k);
116 /* temp1 = alpha * Aik */
117 const BASE temp1_real = alpha_real * Aik_real - alpha_imag * Aik_imag;
118 const BASE temp1_imag = alpha_real * Aik_imag + alpha_imag * Aik_real;
119 const BASE Bik_real = CONST_REAL(B, i * ldb + k);
120 const BASE Bik_imag = CONST_IMAG(B, i * ldb + k);
122 const BASE Ajk_real = CONST_REAL(A, j * lda + k);
123 const BASE Ajk_imag = CONST_IMAG(A, j * lda + k);
124 /* temp2 = alpha * Ajk */
125 const BASE temp2_real = alpha_real * Ajk_real - alpha_imag * Ajk_imag;
126 const BASE temp2_imag = alpha_real * Ajk_imag + alpha_imag * Ajk_real;
127 const BASE Bjk_real = CONST_REAL(B, j * ldb + k);
128 const BASE Bjk_imag = CONST_IMAG(B, j * ldb + k);
130 /* Cij += alpha * Aik * conj(Bjk) + conj(alpha) * Bik * conj(Ajk) */
131 temp_real += ((temp1_real * Bjk_real + temp1_imag * Bjk_imag)
132 + (Bik_real * temp2_real + Bik_imag * temp2_imag));
133 temp_imag += ((temp1_real * (-Bjk_imag) + temp1_imag * Bjk_real)
134 + (Bik_real * (-temp2_imag) + Bik_imag * temp2_real));
136 REAL(C, i * ldc + j) += temp_real;
137 IMAG(C, i * ldc + j) += temp_imag;
141 } else if (uplo == CblasUpper && trans == CblasConjTrans) {
143 for (k = 0; k < K; k++) {
144 for (i = 0; i < N; i++) {
145 BASE Aki_real = CONST_REAL(A, k * lda + i);
146 BASE Aki_imag = CONST_IMAG(A, k * lda + i);
147 BASE Bki_real = CONST_REAL(B, k * ldb + i);
148 BASE Bki_imag = CONST_IMAG(B, k * ldb + i);
149 /* temp1 = alpha * conj(Aki) */
150 BASE temp1_real = alpha_real * Aki_real - alpha_imag * (-Aki_imag);
151 BASE temp1_imag = alpha_real * (-Aki_imag) + alpha_imag * Aki_real;
152 /* temp2 = conj(alpha) * conj(Bki) */
153 BASE temp2_real = alpha_real * Bki_real - alpha_imag * Bki_imag;
154 BASE temp2_imag = -(alpha_real * Bki_imag + alpha_imag * Bki_real);
156 /* Cii += alpha * conj(Aki) * Bki + conj(alpha) * conj(Bki) * Aki */
158 REAL(C, i * lda + i) += 2 * (temp1_real * Bki_real - temp1_imag * Bki_imag);
159 IMAG(C, i * lda + i) = 0.0;
162 for (j = i + 1; j < N; j++) {
163 BASE Akj_real = CONST_REAL(A, k * lda + j);
164 BASE Akj_imag = CONST_IMAG(A, k * lda + j);
165 BASE Bkj_real = CONST_REAL(B, k * ldb + j);
166 BASE Bkj_imag = CONST_IMAG(B, k * ldb + j);
167 /* Cij += alpha * conj(Aki) * Bkj + conj(alpha) * conj(Bki) * Akj */
168 REAL(C, i * lda + j) += (temp1_real * Bkj_real - temp1_imag * Bkj_imag)
169 + (temp2_real * Akj_real - temp2_imag * Akj_imag);
170 IMAG(C, i * lda + j) += (temp1_real * Bkj_imag + temp1_imag * Bkj_real)
171 + (temp2_real * Akj_imag + temp2_imag * Akj_real);
176 } else if (uplo == CblasLower && trans == CblasNoTrans) {
178 for (i = 0; i < N; i++) {
180 /* Cij += alpha Aik conj(Bjk) + conj(alpha) Bik conj(Ajk) */
182 for (j = 0; j < i; j++) {
183 BASE temp_real = 0.0;
184 BASE temp_imag = 0.0;
185 for (k = 0; k < K; k++) {
186 const BASE Aik_real = CONST_REAL(A, i * lda + k);
187 const BASE Aik_imag = CONST_IMAG(A, i * lda + k);
188 /* temp1 = alpha * Aik */
189 const BASE temp1_real = alpha_real * Aik_real - alpha_imag * Aik_imag;
190 const BASE temp1_imag = alpha_real * Aik_imag + alpha_imag * Aik_real;
191 const BASE Bik_real = CONST_REAL(B, i * ldb + k);
192 const BASE Bik_imag = CONST_IMAG(B, i * ldb + k);
194 const BASE Ajk_real = CONST_REAL(A, j * lda + k);
195 const BASE Ajk_imag = CONST_IMAG(A, j * lda + k);
196 /* temp2 = alpha * Ajk */
197 const BASE temp2_real = alpha_real * Ajk_real - alpha_imag * Ajk_imag;
198 const BASE temp2_imag = alpha_real * Ajk_imag + alpha_imag * Ajk_real;
199 const BASE Bjk_real = CONST_REAL(B, j * ldb + k);
200 const BASE Bjk_imag = CONST_IMAG(B, j * ldb + k);
202 /* Cij += alpha * Aik * conj(Bjk) + conj(alpha) * Bik * conj(Ajk) */
203 temp_real += ((temp1_real * Bjk_real + temp1_imag * Bjk_imag)
204 + (Bik_real * temp2_real + Bik_imag * temp2_imag));
205 temp_imag += ((temp1_real * (-Bjk_imag) + temp1_imag * Bjk_real)
206 + (Bik_real * (-temp2_imag) + Bik_imag * temp2_real));
208 REAL(C, i * ldc + j) += temp_real;
209 IMAG(C, i * ldc + j) += temp_imag;
212 /* Cii += alpha Aik conj(Bik) + conj(alpha) Bik conj(Aik) */
214 BASE temp_real = 0.0;
215 /* BASE temp_imag = 0.0; */
216 for (k = 0; k < K; k++) {
217 const BASE Aik_real = CONST_REAL(A, i * lda + k);
218 const BASE Aik_imag = CONST_IMAG(A, i * lda + k);
219 /* temp1 = alpha * Aik */
220 const BASE temp1_real = alpha_real * Aik_real - alpha_imag * Aik_imag;
221 const BASE temp1_imag = alpha_real * Aik_imag + alpha_imag * Aik_real;
222 const BASE Bik_real = CONST_REAL(B, i * ldb + k);
223 const BASE Bik_imag = CONST_IMAG(B, i * ldb + k);
224 temp_real += temp1_real * Bik_real + temp1_imag * Bik_imag;
227 REAL(C, i * ldc + i) += 2 * temp_real;
228 IMAG(C, i * ldc + i) = 0.0;
232 } else if (uplo == CblasLower && trans == CblasConjTrans) {
234 for (k = 0; k < K; k++) {
235 for (i = 0; i < N; i++) {
236 BASE Aki_real = CONST_REAL(A, k * lda + i);
237 BASE Aki_imag = CONST_IMAG(A, k * lda + i);
238 BASE Bki_real = CONST_REAL(B, k * ldb + i);
239 BASE Bki_imag = CONST_IMAG(B, k * ldb + i);
240 /* temp1 = alpha * conj(Aki) */
241 BASE temp1_real = alpha_real * Aki_real - alpha_imag * (-Aki_imag);
242 BASE temp1_imag = alpha_real * (-Aki_imag) + alpha_imag * Aki_real;
243 /* temp2 = conj(alpha) * conj(Bki) */
244 BASE temp2_real = alpha_real * Bki_real - alpha_imag * Bki_imag;
245 BASE temp2_imag = -(alpha_real * Bki_imag + alpha_imag * Bki_real);
247 for (j = 0; j < i; j++) {
248 BASE Akj_real = CONST_REAL(A, k * lda + j);
249 BASE Akj_imag = CONST_IMAG(A, k * lda + j);
250 BASE Bkj_real = CONST_REAL(B, k * ldb + j);
251 BASE Bkj_imag = CONST_IMAG(B, k * ldb + j);
252 /* Cij += alpha * conj(Aki) * Bkj + conj(alpha) * conj(Bki) * Akj */
253 REAL(C, i * lda + j) += (temp1_real * Bkj_real - temp1_imag * Bkj_imag)
254 + (temp2_real * Akj_real - temp2_imag * Akj_imag);
255 IMAG(C, i * lda + j) += (temp1_real * Bkj_imag + temp1_imag * Bkj_real)
256 + (temp2_real * Akj_imag + temp2_imag * Akj_real);
259 /* Cii += alpha * conj(Aki) * Bki + conj(alpha) * conj(Bki) * Aki */
261 REAL(C, i * lda + i) += 2 * (temp1_real * Bki_real - temp1_imag * Bki_imag);
262 IMAG(C, i * lda + i) = 0.0;
267 BLAS_ERROR("unrecognized operation");