From c01c390e3e6f12d90679bf4148813b9147db4035 Mon Sep 17 00:00:00 2001 From: Matteo Cicuttin <datafl4sh@toxicnet.eu> Date: Tue, 19 Oct 2021 08:27:57 +0200 Subject: [PATCH] INFO0939-2021 code. --- info0939/README.md | 35 ++++++++++++---- info0939/profiling.c | 27 ++++++++++++ info0939/scalprod.c | 98 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 151 insertions(+), 9 deletions(-) create mode 100644 info0939/profiling.c create mode 100644 info0939/scalprod.c diff --git a/info0939/README.md b/info0939/README.md index d289bee..fb422a2 100644 --- a/info0939/README.md +++ b/info0939/README.md @@ -6,10 +6,15 @@ Prerequisites for the class: Instructions for downloading the example code --- -The git repository is located at [here](https://gitlab.onelab.info/mcicuttin/snippets/info0939) (`https://gitlab.onelab.info/mcicuttin/snippets/info0939`). We will use the following files: -- `intdiv.c` for the `assert()` example on the integer division program -- `stack_corrupt.c` for a debugging exercise -- `cache.c` for a profiling exercise +The git repository is located at [here](https://gitlab.onelab.info/mcicuttin/snippets/info0939) (`https://gitlab.onelab.info/mcicuttin/snippets/info0939`). 
We will use the following files (in that order): +- `intdiv.c` for the `assert()` example on the integer division program +- `selectionsort.c` first example of GDB usage (stepping, breakpointing & inspecting) +- `backtrace.c` second example of GDB usage (backtrace) +- `insertionsort.c` first debugging exercise +- `stack_corrupt.c` second debugging exercise +- `profiling.c` for a trivial example of use of `gprof` +- `cache.c` for an example of use of `cachegrind` +- `scalprod.c` profiling exercise You can download the files on your machine or on NIC5 via `wget`: @@ -18,9 +23,7 @@ You can download the files on your machine or on NIC5 via `wget`: export REPO_BASE=https://gitlab.onelab.info/mcicuttin/snippets/-/raw/master/info0939 # Files are then donwloaded as following: -wget $REPO_BASE/intdiv.c -wget $REPO_BASE/stack_corrupt.c -wget $REPO_BASE/cache.c +wget $REPO_BASE/filename.c ``` GDB Cheatsheet @@ -35,6 +38,7 @@ To debug with GDB: `gdb <progname>`, then * `c`: continue until the end of the program or the next breakpoint * `break <file:line>`: set a breakpoint * `p`: print the value of a variable +* `x/ct <address>`: examine the contents of memory, where `c` is the count and `t` is the type. `t` takes more or less the same values as the `printf()` format specifiers. Example: `x/8d array` will print 8 integers starting from the address contained in `array` Example: Integer division & `assert()` --- @@ -62,7 +66,7 @@ Debugging exercise: `insertionsort.c` * Compile with `gcc -g -o insertionsort insertionsort.c` * Run with `./insertionsort`. What do you see? -* Launch the code in GDB find why it crashes +* Launch the code in GDB and find why it crashes Debugging exercise: `stack_corrupt.c` --- @@ -71,9 +75,22 @@ Debugging exercise: `stack_corrupt.c` * Run with `./stack_corrupt`. What do you see? 
* Launch the code in GDB and try to explain what you see +Profiling example with `profiling.c` +--- +Example usage of `gprof` + +* Compile with `gcc -g -pg -o profiling profiling.c` +* Run `./profiling` once to generate `gmon.out`, then inspect the profile with `gprof profiling` + Profiling example with `cache.c` --- Example usage of cachegrind to find cache-unfriendly code * Compile with `gcc -O3 -g -o cache cache.c` -* Run with `valgrind --tool=cachegrind ./cache` \ No newline at end of file +* Run with `valgrind --tool=cachegrind ./cache` + +Profiling exercise (guided) with `scalprod.c` +--- + +* [Benchmark](https://www.cpubenchmark.net/cpu.php?cpu=AMD+EPYC+7542&id=3604) +* [Specs](https://en.wikichip.org/wiki/amd/epyc/7542) \ No newline at end of file diff --git a/info0939/profiling.c b/info0939/profiling.c new file mode 100644 index 0000000..eb2f410 --- /dev/null +++ b/info0939/profiling.c @@ -0,0 +1,27 @@ +#include <stdio.h> +#include <stdint.h> + +static uint64_t +fib(uint32_t n) +{ + if (n < 2) + return 1; + + return fib(n-1) + fib(n-2); +} + +static uint64_t +fact(uint32_t n) +{ + if (n > 0) + return n*fact(n-1); + + return 1; +} + +int main(void) +{ + printf("%llu\n", fib(42)); + printf("%llu\n", fact(42)); + return 0; +} diff --git a/info0939/scalprod.c b/info0939/scalprod.c new file mode 100644 index 0000000..18f0788 --- /dev/null +++ b/info0939/scalprod.c @@ -0,0 +1,98 @@ +#include <stdio.h> +#include <sys/resource.h> +#include <cblas.h> +#include <immintrin.h> +#include <stdlib.h> + +#define VEC_SIZE 320000000ULL +#define BLOCK_SIZE 8 + +static void * +Alloc_doubles(size_t size) +{ + void *ret = aligned_alloc(32, size*sizeof(double)); + if (!ret) + { + printf("malloc() failed\n"); + abort(); + } + return ret; +} + +static double +compute_time(const struct rusage *rstart, const struct rusage *rend) +{ + double ret = rend->ru_utime.tv_sec + rend->ru_utime.tv_usec/1e6; + ret -= rstart->ru_utime.tv_sec + rstart->ru_utime.tv_usec/1e6; + return ret; +} + +static double +scal(const double * __restrict__ a, const double * 
__restrict__ b, size_t len) +{ + double ret = 0.0; + size_t blocks = len/BLOCK_SIZE; + + for (size_t blk = 0; blk < blocks; blk++) + for (size_t j = 0; j < BLOCK_SIZE; j++) + ret += a[blk*BLOCK_SIZE+j]*b[blk*BLOCK_SIZE+j]; + + for (size_t i = blocks*BLOCK_SIZE; i < len; i++) + ret += a[i]*b[i]; + + return ret; +} + +static double +scal_avx(const double * __restrict__ a, const double * __restrict__ b, size_t len) +{ + double ret = 0.0; + size_t blocks = len/4; + for (size_t blk = 0; blk < blocks; blk++) + { + __m256d av = _mm256_load_pd(a+blk*4); + __m256d bv = _mm256_load_pd(b+blk*4); + __m256d mv = _mm256_mul_pd(av, bv); + __m128d lowv = _mm256_castpd256_pd128(mv); + __m128d highv = _mm256_extractf128_pd(mv, 1); + lowv = _mm_add_pd(lowv, highv); + __m128d high64 = _mm_unpackhi_pd(lowv, lowv); + ret += _mm_cvtsd_f64(_mm_add_sd(lowv, high64)); + } + + for (size_t i = blocks*4; i < len; i++) + ret += a[i]*b[i]; + + return ret; +} + +int main(void) +{ + double *a = (double *) Alloc_doubles(VEC_SIZE); + double *b = (double *) Alloc_doubles(VEC_SIZE); + + for (size_t i = 0; i < VEC_SIZE; i++) + { + a[i] = ((double) i)/VEC_SIZE; + b[i] = ((double) i)/VEC_SIZE; + } + + struct rusage rstart, rend; + getrusage(RUSAGE_SELF, &rstart); + //double s = scal(a, b, VEC_SIZE); + double s = scal_avx(a, b, VEC_SIZE); + //double s = cblas_ddot(VEC_SIZE, a, 1, b, 1); + getrusage(RUSAGE_SELF, &rend); + + printf("%lg\n", s); + + double time = compute_time(&rstart, &rend); + double gflops = 2*VEC_SIZE/(time*1e9); + double gbs = 2*VEC_SIZE*sizeof(double)/(time*1e9); + + printf("GFLOPS/s: %lg\n", gflops); + printf("GB/s: %lg\n", gbs); + + return 0; +} + -- GitLab