Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: speedup fourierradon with engine=cuda #627

Merged
merged 2 commits into from
Dec 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 15 additions & 3 deletions pylops/signalprocessing/_fourierradon2d_cuda.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
from cmath import exp
from math import pi

import cupy as cp
from numba import cuda

TWO_PI_MINUS = cp.float32(-2.0 * pi)
TWO_PI_PLUS = cp.float32(2.0 * pi)
IMG = cp.complex64(1j)


@cuda.jit
def _radon_inner_2d_kernel(x, y, f, px, h, flim0, flim1, npx, nh):
Expand All @@ -16,7 +20,11 @@ def _radon_inner_2d_kernel(x, y, f, px, h, flim0, flim1, npx, nh):
ih, ifr = cuda.grid(2)
if ih < nh and ifr >= flim0 and ifr <= flim1:
for ipx in range(npx):
y[ih, ifr] += x[ipx, ifr] * exp(-1j * 2 * pi * f[ifr] * px[ipx] * h[ih])
# slow computation of exp(1j * x)
# y[ih, ifr] += x[ipx, ifr] * exp(TWO_PI_MINUS * f[ifr] * px[ipx] * h[ih])
# fast computation of exp(1j * x) - see https://stackoverflow.com/questions/9860711/cucomplex-h-and-exp/9863048#9863048
s, c = cuda.libdevice.sincosf(TWO_PI_MINUS * f[ifr] * px[ipx] * h[ih])
y[ih, ifr] += x[ipx, ifr] * (c + IMG * s)


@cuda.jit
Expand All @@ -31,7 +39,11 @@ def _aradon_inner_2d_kernel(x, y, f, px, h, flim0, flim1, npx, nh):
ipx, ifr = cuda.grid(2)
if ipx < npx and ifr >= flim0 and ifr <= flim1:
for ih in range(nh):
x[ipx, ifr] += y[ih, ifr] * exp(1j * 2 * pi * f[ifr] * px[ipx] * h[ih])
# slow computation of exp(1j * x)
# x[ipx, ifr] += y[ih, ifr] * exp(TWO_PI_I_PLUS * f[ifr] * px[ipx] * h[ih])
# fast computation of exp(1j * x) - see https://stackoverflow.com/questions/9860711/cucomplex-h-and-exp/9863048#9863048
s, c = cuda.libdevice.sincosf(TWO_PI_PLUS * f[ifr] * px[ipx] * h[ih])
x[ipx, ifr] += y[ih, ifr] * (c + IMG * s)


def _radon_inner_2d_cuda(
Expand Down
25 changes: 20 additions & 5 deletions pylops/signalprocessing/_fourierradon3d_cuda.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
from cmath import exp
from math import pi

import cupy as cp
from numba import cuda

TWO_PI_MINUS = cp.float32(-2.0 * pi)
TWO_PI_PLUS = cp.float32(2.0 * pi)
IMG = cp.complex64(1j)


@cuda.jit
def _radon_inner_3d_kernel(x, y, f, py, px, hy, hx, flim0, flim1, npy, npx, nhy, nhx):
Expand All @@ -17,9 +21,15 @@ def _radon_inner_3d_kernel(x, y, f, py, px, hy, hx, flim0, flim1, npy, npx, nhy,
if ihy < nhy and ihx < nhx and ifr >= flim0 and ifr <= flim1:
for ipy in range(npy):
for ipx in range(npx):
y[ihy, ihx, ifr] += x[ipy, ipx, ifr] * exp(
-1j * 2 * pi * f[ifr] * (py[ipy] * hy[ihy] + px[ipx] * hx[ihx])
# slow computation of exp(1j * x)
# y[ihy, ihx, ifr] += x[ipy, ipx, ifr] * exp(
# TWO_PI_MINUS * f[ifr] * (py[ipy] * hy[ihy] + px[ipx] * hx[ihx])
# )
# fast computation of exp(1j * x) - see https://stackoverflow.com/questions/9860711/cucomplex-h-and-exp/9863048#9863048
s, c = cuda.libdevice.sincosf(
TWO_PI_MINUS * f[ifr] * (py[ipy] * hy[ihy] + px[ipx] * hx[ihx])
)
y[ihy, ihx, ifr] += x[ipy, ipx, ifr] * (c + IMG * s)


@cuda.jit
Expand All @@ -35,9 +45,14 @@ def _aradon_inner_3d_kernel(x, y, f, py, px, hy, hx, flim0, flim1, npy, npx, nhy
if ipy < npy and ipx < npx and ifr >= flim0 and ifr <= flim1:
for ihy in range(nhy):
for ihx in range(nhx):
x[ipy, ipx, ifr] += y[ihy, ihx, ifr] * exp(
1j * 2 * pi * f[ifr] * (py[ipy] * hy[ihy] + px[ipx] * hx[ihx])
# slow computation of exp(1j * x)
# x[ipy, ipx, ifr] += y[ihy, ihx, ifr] * exp(
# TWO_PI_I_PLUS * f[ifr] * (py[ipy] * hy[ihy] + px[ipx] * hx[ihx])
# )
s, c = cuda.libdevice.sincosf(
TWO_PI_PLUS * f[ifr] * (py[ipy] * hy[ihy] + px[ipx] * hx[ihx])
)
x[ipy, ipx, ifr] += y[ihy, ihx, ifr] * (c + IMG * s)


def _radon_inner_3d_cuda(
Expand Down
6 changes: 4 additions & 2 deletions pylops/signalprocessing/fourierradon2d.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,12 @@
from pylops.utils.typing import DTypeLike, NDArray

jit_message = deps.numba_import("the radon2d module")
cupy_message = deps.cupy_import("the radon2d module")

if jit_message is None:
from ._fourierradon2d_cuda import _aradon_inner_2d_cuda, _radon_inner_2d_cuda
from ._fourierradon2d_numba import _aradon_inner_2d, _radon_inner_2d
if jit_message is None and cupy_message is None:
from ._fourierradon2d_cuda import _aradon_inner_2d_cuda, _radon_inner_2d_cuda

logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.WARNING)

Expand Down Expand Up @@ -167,7 +169,7 @@ def __init__(
self._register_multiplications(engine)

def _register_multiplications(self, engine: str) -> None:
if engine == "numba" and jit_message is None:
if engine == "numba":
self._matvec = self._matvec_numba
self._rmatvec = self._rmatvec_numba
elif engine == "cuda":
Expand Down
6 changes: 4 additions & 2 deletions pylops/signalprocessing/fourierradon3d.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,12 @@
from pylops.utils.typing import DTypeLike, NDArray

jit_message = deps.numba_import("the radon2d module")
cupy_message = deps.cupy_import("the radon2d module")

if jit_message is None:
from ._fourierradon3d_cuda import _aradon_inner_3d_cuda, _radon_inner_3d_cuda
from ._fourierradon3d_numba import _aradon_inner_3d, _radon_inner_3d
if jit_message is None and cupy_message is None:
from ._fourierradon3d_cuda import _aradon_inner_3d_cuda, _radon_inner_3d_cuda

logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.WARNING)

Expand Down Expand Up @@ -192,7 +194,7 @@ def __init__(
self._register_multiplications(engine)

def _register_multiplications(self, engine: str) -> None:
if engine == "numba" and jit_message is None:
if engine == "numba":
self._matvec = self._matvec_numba
self._rmatvec = self._rmatvec_numba
elif engine == "cuda":
Expand Down
Loading