pulp-platform · mbertuletti · Jul 13, 2022 · Jul 13, 2022 · Jul 14, 2022 · Jul 18, 2022
@@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Use custom compiler for VCS specified with `CC` and `CCX` environment variable
 - Implement operand gating for SIMD and MAC Units in Snitch IPU's DSP Unit
 - Add Channel Estimation application and kernels
+- Add Gauss-Jordan matrix inversion kernel
 
 ### Fixed
 - Fix type issue in `snitch_addr_demux`

@@ -0,0 +1,95 @@
+// Copyright 2021 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Marco Bertuletti, ETH Zurich
+
+// Number of fractional bits of the fixed-point representation (Q16.16 when
+// stored in an int32_t).
+#define FIXED_POINT 16
+// Fixed-point division: (a * 2^FIXED_POINT) / b, truncated to int32_t.
+// The dividend is widened to 64 bits and scaled by multiplication, so the
+// scaling cannot overflow and negative operands are not left-shifted.
+#define FIX_DIV(a, b) ((int32_t)((((int64_t)(a)) * ((int64_t)1 << FIXED_POINT)) / (b)))
+// Fixed-point multiplication: (a * b) / 2^FIXED_POINT, truncated to int32_t.
+// The product is formed in 64 bits; a 32-bit product already overflows for
+// operands as small as 1.0 * 1.0.
+#define FIX_MUL(a, b) ((int32_t)((((int64_t)(a)) * ((int64_t)(b))) >> FIXED_POINT))
+// Smaller of two values. Each argument may be evaluated twice, so do not pass
+// expressions with side effects.
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+
+// NOTE(review): `dump` is not defined in this file. It is presumably a macro
+// from the runtime headers that generates one debug helper per
+// (name, id) pair -- confirm against its definition before relying on it.
+dump(l, 1);
+dump(loopCnt, 2);
+dump(i, 3);
+
+// Prints the first n * m elements of A, one per line.
+void display(int32_t *A, int32_t n, int32_t m);
+
+#ifdef FOLDED
+// Prints n * m elements of A, read through the N_BANKS / N_USED_BANKS
+// index remapping instead of a dense layout.
+void display_folded(int32_t *A, int32_t n, int32_t m);
+#endif
+
+// Writes the transpose of the row-major n x m `matrix` into `t_matrix`
+// (stored row-major as m x n).
+void Transpose(int32_t *matrix, int32_t *t_matrix, int32_t n, int32_t m);
+
+// Computes matrix_product (n x o) = matrix_1 (n x m) * matrix_2 (m x o),
+// all row-major, using FIX_MUL for every elementwise product.
+void MatrixMult(int32_t *matrix_1, int32_t *matrix_2, int32_t *matrix_product,
+ int32_t n, int32_t m, int32_t o);
+
+// Fills the row-major num_rows x num_columns `matrix` with
+// a * column + b * row + c. Only the caller with core_id == 0 writes.
+void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
+ int32_t a, int32_t b, int32_t c, uint32_t core_id);
+
+// Sets every element of the num_rows x num_columns `matrix` to zero.
+// Only the caller with core_id == 0 writes.
+void init_matrix_zeros(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
+ uint32_t core_id);
+
+// Print the first n * m entries of A in storage order, one per line, in the
+// form "Output[<index>] = <value>".
+void display(int32_t *A, int32_t n, int32_t m) {
+  const int32_t count = n * m;
+  int32_t idx = 0;
+  while (idx < count) {
+    printf("Output[%d] = %8d\n", idx, A[idx]);
+    ++idx;
+  }
+}
+
+#ifdef FOLDED
+// Print n * m entries of A, one per line, in the form
+// "Output[<index>] = <value>".
+//
+// Output index idx is split into a group (idx / n) and a position inside the
+// group (idx % n). The dense start offset of the group (group * n) is then
+// remapped: each full run of N_USED_BANKS elements advances the read address
+// by N_BANKS, and the remainder is added on top.
+void display_folded(int32_t *A, int32_t n, int32_t m) {
+  const int32_t count = n * m;
+  for (int32_t idx = 0; idx < count; ++idx) {
+    const int32_t group = idx / n;
+    const int32_t within = idx % n;
+    const int32_t dense_start = group * n;
+    const int32_t folded_start =
+        N_BANKS * (dense_start / N_USED_BANKS) + dense_start % N_USED_BANKS;
+    printf("Output[%d] = %8d\n", idx, A[folded_start + within]);
+  }
+}
+#endif
+
+// Transpose a row-major n x m matrix.
+//
+// matrix   : source, n rows of m elements.
+// t_matrix : destination, receives m rows of n elements; element (row, col)
+//            of the source is stored at (col, row).
+void Transpose(int32_t *matrix, int32_t *t_matrix, int32_t n, int32_t m) {
+  for (int32_t row = 0; row < n; ++row) {
+    const int32_t src_base = row * m;
+    for (int32_t col = 0; col < m; ++col) {
+      t_matrix[col * n + row] = matrix[src_base + col];
+    }
+  }
+}
+
+// Fixed-point matrix product of two row-major matrices.
+//
+// matrix_1       : left operand, n x m.
+// matrix_2       : right operand, m x o.
+// matrix_product : result, n x o. Each entry is zeroed and then accumulated
+//                  in place, one FIX_MUL term at a time.
+void MatrixMult(int32_t *matrix_1, int32_t *matrix_2, int32_t *matrix_product,
+                int32_t n, int32_t m, int32_t o) {
+  for (int32_t row = 0; row < n; ++row) {
+    for (int32_t col = 0; col < o; ++col) {
+      int32_t *acc = &matrix_product[row * o + col];
+      *acc = 0;
+      for (int32_t inner = 0; inner < m; ++inner) {
+        *acc += FIX_MUL(matrix_1[row * m + inner], matrix_2[inner * o + col]);
+      }
+    }
+  }
+}
+
+// Fill a row-major num_rows x num_columns matrix with an affine pattern:
+// element (row, col) becomes a * col + b * row + c.
+//
+// Only the caller whose core_id is 0 performs the writes; every other caller
+// returns without touching `matrix`.
+void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
+                 int32_t a, int32_t b, int32_t c, uint32_t core_id) {
+  if (core_id != 0) {
+    return;
+  }
+  for (uint32_t row = 0; row < num_rows; ++row) {
+    const uint32_t row_base = row * num_columns;
+    for (uint32_t col = 0; col < num_columns; ++col) {
+      matrix[row_base + col] = a * (int32_t)col + b * (int32_t)row + c;
+    }
+  }
+}
+
+// Zero every element of a row-major num_rows x num_columns matrix.
+//
+// Only the caller whose core_id is 0 performs the writes; every other caller
+// returns without touching `matrix`.
+void init_matrix_zeros(int32_t *matrix, uint32_t num_rows, uint32_t num_columns,
+                       uint32_t core_id) {
+  if (core_id != 0) {
+    return;
+  }
+  for (uint32_t row = 0; row < num_rows; ++row) {
+    const uint32_t row_base = row * num_columns;
+    for (uint32_t col = 0; col < num_columns; ++col) {
+      matrix[row_base + col] = 0;
+    }
+  }
+}
@@ -0,0 +1,105 @@
+// Copyright 2021 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Marco Bertuletti, ETH Zurich
+
+#include "encoding.h"
+#include "printf.h"
+#include "runtime.h"
+#include "synchronization.h"
+
+// Dimensions of the input matrix: main() initializes `matrix` as N x M and
+// passes M to the inversion kernels as the matrix size.
+#define N 16
+#define M 16
+// Parameters of the folded layout: N_BANKS is the stride between folded rows
+// and N_USED_BANKS the number of elements kept per folded row (see the sizes
+// of the FOLDED buffers declared below).
+#define N_BANKS (1024)
+#define N_USED_BANKS (16)
+
+// VERBOSE: print the result after the kernel has run.
+#define VERBOSE
+// Kernel selection. main() checks SINGLE first, then PARALLEL, then FOLDED,
+// so enable exactly one of them.
+// NOTE(review): these switches are presumably also consumed by the headers
+// included after this point, so they must stay above those includes.
+#define SINGLE
+// #define PARALLEL
+// #define FOLDED
+
+#include "initialization.h"
+#include "kernel/mempool_mat_inv_q32p.h"
+#include "kernel/mempool_mat_inv_q32s.h"
+
+// Buffers shared by all cores, placed in the ".l1" linker section.
+#ifdef FOLDED
+// Dense N x M input matrix.
+int32_t matrix[N * M] __attribute__((aligned(N_BANKS), section(".l1")));
+// Folded copy of the input: (N * M) / N_USED_BANKS rows of N_BANKS elements.
+int32_t folded_matrix[N_BANKS * ((N * M) / N_USED_BANKS)]
+ __attribute__((aligned(N_BANKS), section(".l1")));
+// Result buffer, allocated with the same folded size as folded_matrix.
+int32_t inv[N_BANKS * ((N * M) / N_USED_BANKS)]
+ __attribute__((aligned(N_BANKS), section(".l1")));
+// Word handed by address to the folded kernel; zeroed by core 0 in main().
+uint32_t flag __attribute__((section(".l1")));
+#else
+// Dense N x M input matrix.
+int32_t matrix[N * M] __attribute__((aligned(N), section(".l1")));
+// Dense M x M result buffer.
+int32_t inv[M * M] __attribute__((aligned(N), section(".l1")));
+// Word handed by address to the parallel kernel; zeroed by core 0 in main().
+uint32_t flag __attribute__((section(".l1")));
+#endif
+
+// Entry point executed by every core.
+//
+// Steps: initialize the barrier, let core 0 fill the input and clear the
+// output buffers, run the kernel selected at compile time (SINGLE, PARALLEL
+// or FOLDED, checked in that order) and, when VERBOSE is defined, let core 0
+// print the result. Every phase ends with a barrier across all cores.
+//
+// Returns 0.
+int main() {
+
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t num_cores = mempool_get_core_count();
+  // Initialize barrier and synchronize
+  mempool_barrier_init(core_id);
+
+/* initialize the data */
+#if defined(SINGLE) || defined(PARALLEL)
+  init_matrix(matrix, N, M, -156, 427, -219, core_id);
+  init_matrix_zeros(inv, M, M, core_id);
+  if (core_id == 0) {
+    flag = 0U;
+  }
+  mempool_barrier(num_cores);
+
+#elif defined(FOLDED)
+  // Number of cores taking part in the folded kernel.
+  uint32_t nPE = N_USED_BANKS >> 2U;
+  init_matrix(matrix, N, M, -156, 427, -219, core_id);
+  init_matrix_zeros(folded_matrix, ((N * M) / N_USED_BANKS), N_BANKS, core_id);
+  init_matrix_zeros(inv, ((N * M) / N_USED_BANKS), N_BANKS, core_id);
+  if (core_id == 0) {
+    flag = 0U;
+  }
+  mempool_barrier(num_cores);
+
+#endif
+
+/* Execute the kernel */
+#if defined(SINGLE)
+  if (core_id == 0) {
+    mempool_start_benchmark();
+    mempool_GJinv_q32s(matrix, inv, M);
+    mempool_stop_benchmark();
+  }
+  mempool_barrier(num_cores);
+
+#elif defined(PARALLEL)
+  if (core_id < MIN(NUM_CORES, N / 4)) {
+    mempool_start_benchmark();
+    mempool_GJinv_q32p(matrix, inv, M, &flag);
+    mempool_stop_benchmark();
+  }
+  mempool_barrier(num_cores);
+
+#elif defined(FOLDED)
+  mempool_start_benchmark();
+  fold_matrix(matrix, folded_matrix, N);
+  mempool_stop_benchmark();
+  if (core_id < nPE) {
+    mempool_start_benchmark();
+    mempool_GJinv_folded_q32p(folded_matrix, inv, M, &flag, nPE);
+    mempool_stop_benchmark();
+  }
+  mempool_barrier(num_cores);
+
+#endif
+
+/* Display the result of computation */
+#ifdef VERBOSE
+#if defined(FOLDED) && !defined(SINGLE) && !defined(PARALLEL)
+  // The folded kernel ran: `inv` is the bank-strided buffer, so it has to be
+  // read back through the folded index mapping rather than as a dense array.
+  if (core_id == 0) {
+    display_folded(inv, M, N);
+  }
+#else
+  if (core_id == 0) {
+    display(inv, M, N);
+  }
+#endif
+  mempool_barrier(num_cores);
+#endif
+
+  return 0;
+}