Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
dg
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
gmsh
dg
Commits
d7deccc9
Commit
d7deccc9
authored
4 years ago
by
Matteo Cicuttin
Browse files
Options
Downloads
Patches
Plain Diff
Lifting perf tests
parent
cbf819d4
No related branches found
No related tags found
No related merge requests found
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
include/kernels_gpu.h
+4
-4
4 additions, 4 deletions
include/kernels_gpu.h
src/entity_data.cpp
+1
-1
1 addition, 1 deletion
src/entity_data.cpp
src/kernels_cuda.cu
+6
-6
6 additions, 6 deletions
src/kernels_cuda.cu
tests/test_lifting_gpu.cpp
+8
-2
8 additions, 2 deletions
tests/test_lifting_gpu.cpp
with
19 additions
and
13 deletions
include/kernels_gpu.h
+
4
−
4
View file @
d7deccc9
...
@@ -19,7 +19,7 @@ struct kernel_gpu_sizes<1>
...
@@ -19,7 +19,7 @@ struct kernel_gpu_sizes<1>
static
const
size_t
cells_per_dblock
=
32
;
static
const
size_t
cells_per_dblock
=
32
;
static
const
size_t
dofs_per_dblock
=
num_bf
*
cells_per_dblock
;
static
const
size_t
dofs_per_dblock
=
num_bf
*
cells_per_dblock
;
static
const
size_t
dblock_size
=
128
;
static
const
size_t
dblock_size
=
128
;
static
const
size_t
parallel_dblocks
=
4
;
static
const
size_t
parallel_dblocks
=
1
;
};
};
template
<
>
template
<
>
...
@@ -34,7 +34,7 @@ struct kernel_gpu_sizes<2>
...
@@ -34,7 +34,7 @@ struct kernel_gpu_sizes<2>
static
const
size_t
cells_per_dblock
=
12
;
static
const
size_t
cells_per_dblock
=
12
;
static
const
size_t
dofs_per_dblock
=
num_bf
*
cells_per_dblock
;
static
const
size_t
dofs_per_dblock
=
num_bf
*
cells_per_dblock
;
static
const
size_t
dblock_size
=
128
;
static
const
size_t
dblock_size
=
128
;
static
const
size_t
parallel_dblocks
=
4
;
static
const
size_t
parallel_dblocks
=
1
;
};
};
template
<
>
template
<
>
...
@@ -49,7 +49,7 @@ struct kernel_gpu_sizes<3>
...
@@ -49,7 +49,7 @@ struct kernel_gpu_sizes<3>
static
const
size_t
cells_per_dblock
=
6
;
static
const
size_t
cells_per_dblock
=
6
;
static
const
size_t
dofs_per_dblock
=
num_bf
*
cells_per_dblock
;
static
const
size_t
dofs_per_dblock
=
num_bf
*
cells_per_dblock
;
static
const
size_t
dblock_size
=
128
;
static
const
size_t
dblock_size
=
128
;
static
const
size_t
parallel_dblocks
=
4
;
static
const
size_t
parallel_dblocks
=
1
;
};
};
template
<
>
template
<
>
...
@@ -147,4 +147,4 @@ gpu_compute_field_derivatives(entity_data_gpu& edg,
...
@@ -147,4 +147,4 @@ gpu_compute_field_derivatives(entity_data_gpu& edg,
gpuTextureObject_t
F
,
double
*
dF_dx
,
double
*
dF_dy
,
double
*
dF_dz
);
gpuTextureObject_t
F
,
double
*
dF_dx
,
double
*
dF_dy
,
double
*
dF_dz
);
void
void
gpu_compute_flux_lifting
(
entity_data_gpu
&
edg
,
gpuTextureObject_t
f
,
double
*
out
);
gpu_compute_flux_lifting
(
entity_data_gpu
&
edg
,
const
double
*
f
,
double
*
out
);
\ No newline at end of file
\ No newline at end of file
This diff is collapsed.
Click to expand it.
src/entity_data.cpp
+
1
−
1
View file @
d7deccc9
...
@@ -58,7 +58,7 @@ entity_data_gpu::entity_data_gpu(const entity_data_cpu& ed)
...
@@ -58,7 +58,7 @@ entity_data_gpu::entity_data_gpu(const entity_data_cpu& ed)
auto
src_row
=
iO
*
ed
.
num_bf
+
i
;
auto
src_row
=
iO
*
ed
.
num_bf
+
i
;
auto
src_col
=
j
;
auto
src_col
=
j
;
auto
dst_row
=
i
;
auto
dst_row
=
i
;
auto
dst_col
=
iO
*
4
*
ed
.
num_fluxes
+
(
((
4
*
ed
.
num_fluxes
-
i
)
+
j
)
%
(
4
*
ed
.
num_fluxes
)
);
auto
dst_col
=
iO
*
4
*
ed
.
num_fluxes
+
j
;
//
( ((4*ed.num_fluxes - i) + j)%(4*ed.num_fluxes) );
lm
(
dst_row
,
dst_col
)
=
ed
.
lifting_matrices
(
src_row
,
src_col
);
lm
(
dst_row
,
dst_col
)
=
ed
.
lifting_matrices
(
src_row
,
src_col
);
}
}
}
}
...
...
This diff is collapsed.
Click to expand it.
src/kernels_cuda.cu
+
6
−
6
View file @
d7deccc9
...
@@ -189,7 +189,7 @@ gpu_compute_field_derivatives(entity_data_gpu& edg,
...
@@ -189,7 +189,7 @@ gpu_compute_field_derivatives(entity_data_gpu& edg,
template
<
size_t
K
>
template
<
size_t
K
>
__global__
void
__global__
void
gpu_lift_planar
(
gpuTextureObject_t
flux
,
gpuTextureObject_t
LM_tex
,
gpu_lift_planar
(
const
double
*
flux
,
gpuTextureObject_t
LM_tex
,
const
double
*
__restrict__
dets
,
double
*
__restrict__
lifted_flux
,
const
double
*
__restrict__
dets
,
double
*
__restrict__
lifted_flux
,
int32_t
num_all_elems
,
int32_t
*
orients
,
int32_t
dof_base
,
int32_t
flux_base
)
int32_t
num_all_elems
,
int32_t
*
orients
,
int32_t
dof_base
,
int32_t
flux_base
)
{
{
...
@@ -215,8 +215,8 @@ gpu_lift_planar(gpuTextureObject_t flux, gpuTextureObject_t LM_tex,
...
@@ -215,8 +215,8 @@ gpu_lift_planar(gpuTextureObject_t flux, gpuTextureObject_t LM_tex,
for
(
int32_t
dof
=
0
;
dof
<
4
*
KS
::
num_fluxes
;
dof
++
)
for
(
int32_t
dof
=
0
;
dof
<
4
*
KS
::
num_fluxes
;
dof
++
)
{
{
int32_t
l_ofs
=
LM_orient
+
LM_row
+
KS
::
num_bf
*
dof
;
int32_t
l_ofs
=
LM_orient
+
LM_row
+
KS
::
num_bf
*
dof
;
int32_t
f_ofs
=
elem_flux_base
+
(
dof
+
delta
)
%
(
4
*
KS
::
num_fluxes
);
int32_t
f_ofs
=
elem_flux_base
+
dof
;
//
(dof+delta)%(4*KS::num_fluxes);
acc
+=
inv_det
*
fetch_tex
(
LM_tex
,
l_ofs
)
*
fetch_tex
(
flux
,
f_ofs
);
acc
+=
inv_det
*
fetch_tex
(
LM_tex
,
l_ofs
)
*
flux
[
f_ofs
];
//
fetch_tex(flux, f_ofs);
}
}
lifted_flux
[
cur_dof_offset
]
+=
acc
;
lifted_flux
[
cur_dof_offset
]
+=
acc
;
...
@@ -224,9 +224,9 @@ gpu_lift_planar(gpuTextureObject_t flux, gpuTextureObject_t LM_tex,
...
@@ -224,9 +224,9 @@ gpu_lift_planar(gpuTextureObject_t flux, gpuTextureObject_t LM_tex,
template
<
size_t
K
>
template
<
size_t
K
>
void
void
launch_lift_kernel
(
entity_data_gpu
&
edg
,
gpuTextureObject_t
f
,
double
*
out
)
launch_lift_kernel
(
entity_data_gpu
&
edg
,
const
double
*
f
,
double
*
out
)
{
{
const
auto
THREADS_PER_BLOCK
=
256
;
//kernel_gpu_sizes<K>::deriv_threads;
const
auto
THREADS_PER_BLOCK
=
128
;
//kernel_gpu_sizes<K>::deriv_threads;
auto
num_blocks
=
edg
.
num_bf
*
edg
.
num_all_elems
/
THREADS_PER_BLOCK
;
auto
num_blocks
=
edg
.
num_bf
*
edg
.
num_all_elems
/
THREADS_PER_BLOCK
;
if
(
edg
.
num_bf
*
edg
.
num_all_elems
%
THREADS_PER_BLOCK
)
if
(
edg
.
num_bf
*
edg
.
num_all_elems
%
THREADS_PER_BLOCK
)
num_blocks
+=
1
;
num_blocks
+=
1
;
...
@@ -245,7 +245,7 @@ launch_lift_kernel(entity_data_gpu& edg, gpuTextureObject_t f, double *out)
...
@@ -245,7 +245,7 @@ launch_lift_kernel(entity_data_gpu& edg, gpuTextureObject_t f, double *out)
}
}
void
void
gpu_compute_flux_lifting
(
entity_data_gpu
&
edg
,
gpuTextureObject_t
f
,
double
*
out
)
gpu_compute_flux_lifting
(
entity_data_gpu
&
edg
,
const
double
*
f
,
double
*
out
)
{
{
switch
(
edg
.
a_order
)
switch
(
edg
.
a_order
)
{
{
...
...
This diff is collapsed.
Click to expand it.
tests/test_lifting_gpu.cpp
+
8
−
2
View file @
d7deccc9
...
@@ -137,14 +137,14 @@ int test_lifting(int geometric_order, int approximation_order)
...
@@ -137,14 +137,14 @@ int test_lifting(int geometric_order, int approximation_order)
edgs
.
push_back
(
std
::
move
(
edg
)
);
edgs
.
push_back
(
std
::
move
(
edg
)
);
}
}
texture_allocator
<
double
>
PFdotn_gpu
(
PFdotn
.
data
(),
PFdotn
.
size
());
device_vector
<
double
>
PFdotn_gpu
(
PFdotn
.
data
(),
PFdotn
.
size
());
device_vector
<
double
>
LiftF_gpu
(
LiftF
.
data
(),
LiftF
.
size
());
device_vector
<
double
>
LiftF_gpu
(
LiftF
.
data
(),
LiftF
.
size
());
for
(
auto
&
edg
:
edgs
)
for
(
auto
&
edg
:
edgs
)
{
{
timecounter_gpu
tc
;
timecounter_gpu
tc
;
tc
.
tic
();
tc
.
tic
();
gpu_compute_flux_lifting
(
edg
,
PFdotn_gpu
.
get_texture
(),
LiftF_gpu
.
data
());
gpu_compute_flux_lifting
(
edg
,
PFdotn_gpu
.
data
(),
LiftF_gpu
.
data
());
double
time
=
tc
.
toc
();
double
time
=
tc
.
toc
();
auto
num_cells
=
edg
.
num_all_elems
;
auto
num_cells
=
edg
.
num_all_elems
;
...
@@ -153,6 +153,12 @@ int test_lifting(int geometric_order, int approximation_order)
...
@@ -153,6 +153,12 @@ int test_lifting(int geometric_order, int approximation_order)
std
::
cout
<<
"Kernel runtime: "
<<
time
<<
" seconds. Estimated performance: "
;
std
::
cout
<<
"Kernel runtime: "
<<
time
<<
" seconds. Estimated performance: "
;
double
flops
=
3
*
(
edg
.
num_bf
)
*
4
*
edg
.
num_fluxes
*
num_cells
;
double
flops
=
3
*
(
edg
.
num_bf
)
*
4
*
edg
.
num_fluxes
*
num_cells
;
std
::
cout
<<
flops
/
(
1e9
*
time
)
<<
" GFlops/s"
<<
std
::
endl
;
std
::
cout
<<
flops
/
(
1e9
*
time
)
<<
" GFlops/s"
<<
std
::
endl
;
auto
read_gbs
=
8
*
4
*
edg
.
num_fluxes
*
num_cells
/
(
1e9
*
time
);
auto
write_gbs
=
8
*
edg
.
num_bf
*
num_cells
/
(
1e9
*
time
);
auto
tot_gbs
=
read_gbs
+
write_gbs
;
std
::
cout
<<
"Read: "
<<
read_gbs
<<
" GB/s, write: "
<<
write_gbs
;
std
::
cout
<<
" GB/s, total: "
<<
tot_gbs
<<
" GB/s"
<<
std
::
endl
;
}
}
else
else
{
{
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment