From c01c390e3e6f12d90679bf4148813b9147db4035 Mon Sep 17 00:00:00 2001
From: Matteo Cicuttin <datafl4sh@toxicnet.eu>
Date: Tue, 19 Oct 2021 08:27:57 +0200
Subject: [PATCH] INFO0939-2021 code.

---
 info0939/README.md   | 35 ++++++++++++----
 info0939/profiling.c | 27 ++++++++++++
 info0939/scalprod.c  | 98 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 151 insertions(+), 9 deletions(-)
 create mode 100644 info0939/profiling.c
 create mode 100644 info0939/scalprod.c

diff --git a/info0939/README.md b/info0939/README.md
index d289bee..fb422a2 100644
--- a/info0939/README.md
+++ b/info0939/README.md
@@ -6,10 +6,15 @@ Prerequisites for the class:
 
 Instructions for downloading the example code
 ---
-The git repository is located at [here](https://gitlab.onelab.info/mcicuttin/snippets/info0939) (`https://gitlab.onelab.info/mcicuttin/snippets/info0939`). We will use the following files:  
-- `intdiv.c` for the `assert()` example on the integer division program  
-- `stack_corrupt.c` for a debugging exercise  
-- `cache.c` for a profiling exercise  
+The git repository is located [here](https://gitlab.onelab.info/mcicuttin/snippets/info0939) (`https://gitlab.onelab.info/mcicuttin/snippets/info0939`). We will use the following files (in this order):  
+- `intdiv.c` for the `assert()` example on the integer division program   
+- `selectionsort.c` first example of GDB usage (stepping, breakpointing & inspecting)  
+- `backtrace.c` second example of GDB usage (backtrace)  
+- `insertionsort.c` first debugging exercise  
+- `stack_corrupt.c` second debugging exercise  
+- `profiling.c` for a trivial example of use of `gprof`  
+- `cache.c` for an example of the use of `cachegrind`  
+- `scalprod.c` profiling exercise  
 
 You can download the files on your machine or on NIC5 via `wget`:
 
@@ -18,9 +23,7 @@ You can download the files on your machine or on NIC5 via `wget`:
 export REPO_BASE=https://gitlab.onelab.info/mcicuttin/snippets/-/raw/master/info0939
 
 # Files are then donwloaded as following:
-wget $REPO_BASE/intdiv.c
-wget $REPO_BASE/stack_corrupt.c
-wget $REPO_BASE/cache.c
+wget $REPO_BASE/filename.c
 ```
 
 GDB Cheatsheet
@@ -35,6 +38,7 @@ To debug with GDB: `gdb <progname>`, then
 * `c`: continue until the end of the program or the next breakpoint
 * `break <file:line>`: set a breakpoint
 * `p`: print the value of a variable
+* `x/ct <address>`: examine the contents of memory, where `c` is the count and `t` is the type. `t` takes more or less the same values as the `printf()` format specifiers. Example: `x/8d array` will print 8 integers starting from the address contained in `array`
 
 Example: Integer division & `assert()`
 ---
@@ -62,7 +66,7 @@ Debugging exercise: `insertionsort.c`
 
 * Compile with `gcc -g -o insertionsort insertionsort.c`
 * Run with `./insertionsort`. What do you see?
-* Launch the code in GDB find why it crashes
+* Launch the code in GDB and find why it crashes
 
 Debugging exercise: `stack_corrupt.c`
 ---
@@ -71,9 +75,22 @@ Debugging exercise: `stack_corrupt.c`
 * Run with `./stack_corrupt`. What do you see?
 * Launch the code in GDB and try to explain what you see
 
+Profiling example with `profiling.c`
+---
+Example usage of `gprof`
+
+* Compile with `gcc -g -pg -o profiling profiling.c`
+* Run the program once with `./profiling` (this writes `gmon.out`), then display the profile with `gprof profiling gmon.out`
+
 Profiling example with `cache.c`
 ---
 Example usage of cachegrind to find cache-unfriendly code
 
 * Compile with `gcc -O3 -g -o cache cache.c`
-* Run with `valgrind --tool=cachegrind ./cache`
\ No newline at end of file
+* Run with `valgrind --tool=cachegrind ./cache`
+
+Profiling exercise (guided) with `scalprod.c`
+---
+
+* [Benchmark](https://www.cpubenchmark.net/cpu.php?cpu=AMD+EPYC+7542&id=3604)  
+* [Specs](https://en.wikichip.org/wiki/amd/epyc/7542)  
\ No newline at end of file
diff --git a/info0939/profiling.c b/info0939/profiling.c
new file mode 100644
index 0000000..eb2f410
--- /dev/null
+++ b/info0939/profiling.c
@@ -0,0 +1,27 @@
+#include <stdio.h>
+#include <stdint.h>
+
+/* Doubly-recursive Fibonacci with fib(0) == fib(1) == 1; every call with n >= 2 makes two further calls. */
+static uint64_t
+fib(uint32_t n)
+{
+    /* Indices 0 and 1 are the base cases and both yield 1. */
+    const int is_base_case = (n <= 1);
+    return is_base_case ? 1 : fib(n-1) + fib(n-2);
+}
+
+/* Recursive factorial with 0! == 1, computed in uint64_t: the result wraps modulo 2^64 for n > 20. */
+static uint64_t
+fact(uint32_t n)
+{
+    if (n == 0)    /* n is unsigned, so zero is the only value that is not > 0 */
+        return 1;
+    return fact(n-1)*n;
+}
+/* Calls both recursive functions once and prints their results (the README uses this program as the gprof example). */
+int main(void)
+{
+    printf("%llu\n", (unsigned long long) fib(42));   /* cast: uint64_t is not necessarily unsigned long long */
+    printf("%llu\n", (unsigned long long) fact(42));  /* 42! exceeds 64 bits, so this prints the value wrapped modulo 2^64 */
+    return 0;
+}
diff --git a/info0939/scalprod.c b/info0939/scalprod.c
new file mode 100644
index 0000000..18f0788
--- /dev/null
+++ b/info0939/scalprod.c
@@ -0,0 +1,98 @@
+#include <stdio.h>
+#include <sys/resource.h>
+#include <cblas.h>
+#include <immintrin.h>
+#include <stdlib.h>
+
+#define VEC_SIZE 320000000ULL
+#define BLOCK_SIZE 8
+/* Return storage for `size` doubles on a 32-byte boundary; aborts on failure. The byte count is rounded up to a multiple of 32, as C11 aligned_alloc() requires. */
+static void *
+Alloc_doubles(size_t size)
+{
+    void *ret = aligned_alloc(32, (size*sizeof(double) + 31) & ~(size_t) 31);
+    if (!ret)
+    {
+        fprintf(stderr, "aligned_alloc() failed\n");
+        abort();
+    }
+    return ret;
+}
+/* Difference, in seconds, between the user CPU time (ru_utime) of two getrusage() snapshots. */
+static double
+compute_time(const struct rusage *rstart, const struct rusage *rend)
+{
+    const double t_end   = rend->ru_utime.tv_sec + rend->ru_utime.tv_usec/1e6;
+    const double t_begin = rstart->ru_utime.tv_sec + rstart->ru_utime.tv_usec/1e6;
+    return t_end - t_begin;
+}
+
+/* Plain C dot product of a and b (len elements each): whole BLOCK_SIZE-element blocks first, then the remainder. */
+static double
+scal(const double * __restrict__ a, const double * __restrict__ b, size_t len)
+{
+    const size_t blocked = (len/BLOCK_SIZE)*BLOCK_SIZE;  /* elements covered by whole blocks */
+    double acc = 0.0;
+
+    for (size_t base = 0; base < blocked; base += BLOCK_SIZE)
+        for (size_t off = 0; off < BLOCK_SIZE; off++)
+            acc += a[base+off]*b[base+off];
+    /* Leftover elements that do not fill a whole block. */
+    for (size_t i = blocked; i < len; i++)
+        acc += a[i]*b[i];
+    return acc;
+}
+/* Dot product of a and b (len elements each) using 256-bit AVX intrinsics, four doubles per iteration. */
+static double
+scal_avx(const double * __restrict__ a, const double * __restrict__ b, size_t len)
+{
+    double ret = 0.0;
+    size_t blocks = len/4;  /* number of complete 4-element groups */
+    for (size_t blk = 0; blk < blocks; blk++)
+    {
+        __m256d av      = _mm256_load_pd(a+blk*4);  /* aligned load: the address must be 32-byte aligned */
+        __m256d bv      = _mm256_load_pd(b+blk*4);
+        __m256d mv      = _mm256_mul_pd(av, bv);  /* four element-wise products m0..m3 */
+        __m128d lowv    = _mm256_castpd256_pd128(mv);  /* lower half: {m0, m1} */
+        __m128d highv   = _mm256_extractf128_pd(mv, 1);  /* upper half: {m2, m3} */
+                lowv    = _mm_add_pd(lowv, highv);  /* {m0+m2, m1+m3} */
+        __m128d high64  = _mm_unpackhi_pd(lowv, lowv);  /* {m1+m3, m1+m3} */
+        ret +=  _mm_cvtsd_f64(_mm_add_sd(lowv, high64));  /* adds (m0+m2)+(m1+m3) to the running sum */
+    }
+    /* Scalar loop for the last len % 4 elements. */
+    for (size_t i = blocks*4; i < len; i++)
+        ret += a[i]*b[i];
+
+    return ret;
+}
+
+/* Benchmark driver: fills two VEC_SIZE-element vectors, times one dot product, prints the result and the achieved rates. */
+int main(void)
+{
+    double *a = (double *) Alloc_doubles(VEC_SIZE);
+    double *b = (double *) Alloc_doubles(VEC_SIZE);
+
+    for (size_t i = 0; i < VEC_SIZE; i++)
+    {
+        a[i] = ((double) i)/VEC_SIZE;
+        b[i] = ((double) i)/VEC_SIZE;
+    }
+    /* Only the kernel sits between the two snapshots; swap the active line to compare implementations. */
+    struct rusage rstart, rend;
+    getrusage(RUSAGE_SELF, &rstart);
+    //double s = scal(a, b, VEC_SIZE);
+    double s = scal_avx(a, b, VEC_SIZE);
+    //double s = cblas_ddot(VEC_SIZE, a, 1, b, 1);
+    getrusage(RUSAGE_SELF, &rend);
+
+    printf("%lg\n", s);
+    double time = compute_time(&rstart, &rend);
+    double gflops = 2*VEC_SIZE/(time*1e9);  /* one multiply and one add per element */
+    double gbs = 2*VEC_SIZE*sizeof(double)/(time*1e9);  /* two doubles read per element */
+    printf("GFLOPS/s: %lg\n", gflops);
+    printf("GB/s: %lg\n", gbs);
+    free(a);  /* release both vectors before exiting */
+    free(b);
+    return 0;
+}
+
-- 
GitLab