Skip to content

Commit

Permalink
#3504 manage the cuda context outside the nvjpeg decoder
Browse files Browse the repository at this point in the history
  • Loading branch information
totaam committed Apr 2, 2022
1 parent d8f6d35 commit 2f1c3ab
Show file tree
Hide file tree
Showing 5 changed files with 107 additions and 79 deletions.
6 changes: 3 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2289,9 +2289,9 @@ def nvcc_compile(cmd):
add_cython_ext("xpra.codecs.nvjpeg.encoder",
["xpra/codecs/nvjpeg/encoder.pyx"],
**nvjpeg_pkgconfig)
#add_cython_ext("xpra.codecs.nvjpeg.decoder",
# ["xpra/codecs/nvjpeg/decoder.pyx"],
# **nvjpeg_pkgconfig)
add_cython_ext("xpra.codecs.nvjpeg.decoder",
["xpra/codecs/nvjpeg/decoder.pyx"],
**nvjpeg_pkgconfig)

jpeg = jpeg_decoder_ENABLED or jpeg_encoder_ENABLED
toggle_packages(jpeg, "xpra.codecs.jpeg")
Expand Down
21 changes: 8 additions & 13 deletions xpra/client/gl/gl_window_backing_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
glGetString, glViewport, glMatrixMode, glLoadIdentity, glOrtho,
glGenTextures, glDisable,
glBindTexture, glPixelStorei, glEnable, glBegin, glFlush,
glBindBuffer, glGenBuffers, glGetBufferParameteriv, glBufferData,
glBindBuffer, glGenBuffers, glGetBufferParameteriv, glBufferData, glDeleteBuffers,
glTexParameteri,
glTexImage2D,
glMultiTexCoord2i,
Expand Down Expand Up @@ -579,6 +579,7 @@ def close_gl_config(self):
"""

def close(self):
self.free_cuda_context()
self.close_gl_config()
#This seems to cause problems, so we rely
#on destroying the context to clear textures and fbos...
Expand Down Expand Up @@ -1045,20 +1046,15 @@ def paint_nvjpeg(gl_context):
rgb_format = "RGB"

self.gl_init()

from xpra.codecs.cuda_common.cuda_context import cuda_device_context
from xpra.codecs.nvjpeg.decoder import get_default_device
from pycuda.driver import memcpy_dtod
from pycuda.gl import RegisteredBuffer, graphics_map_flags

pbo = glGenBuffers(1)
def copy_buffer(buf, size):
log("copy_buffer(%s, %s)", buf, size)
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo)
glBufferData(GL_PIXEL_UNPACK_BUFFER, size, None, GL_STREAM_DRAW)
bsize = glGetBufferParameteriv(GL_PIXEL_UNPACK_BUFFER, GL_BUFFER_SIZE)
assert bsize==size, "expected size %i but got %i" % (size, bsize)
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0)
#import-outside-toplevel
from pycuda.driver import memcpy_dtod #pylint: disable=no-name-in-module
from pycuda.gl import RegisteredBuffer, graphics_map_flags
cuda_pbo = RegisteredBuffer(int(pbo), graphics_map_flags.WRITE_DISCARD)
log("RegisteredBuffer%s=%s", (pbo, graphics_map_flags.WRITE_DISCARD), cuda_pbo)
mapping = cuda_pbo.map()
Expand All @@ -1067,10 +1063,8 @@ def copy_buffer(buf, size):
memcpy_dtod(ptr, buf, size)
mapping.unmap()

#create an OpenGL compatible context:
dev = get_default_device()
gldev = cuda_device_context(dev.device_id, dev.device, True)
img = self.nvjpeg_decoder.decompress_with_device(gldev, rgb_format, img_data, None, copy_buffer)
with self.assign_cuda_context(True):
img = self.nvjpeg_decoder.decompress_with_device(rgb_format, img_data, None, copy_buffer)
log("paint_nvjpeg(%s) img=%s, updating fbo", gl_context, img)

target = GL_TEXTURE_RECTANGLE_ARB
Expand Down Expand Up @@ -1104,6 +1098,7 @@ def copy_buffer(buf, size):
self.present_fbo(x, y, width, height, options.intget("flush", 0))
# present_fbo has reset state already
fire_paint_callbacks(callbacks)
glDeleteBuffers(1, [pbo])

self.idle_add(self.with_gl_context, paint_nvjpeg)
return
Expand Down
25 changes: 23 additions & 2 deletions xpra/client/window_backing_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,7 @@ def __init__(self, wid : int, window_alpha : bool):
self.spng_decoder = get_codec("dec_spng")
self.avif_decoder = get_codec("dec_avif")
self.nvjpeg_decoder = get_codec("dec_nvjpeg")
self.cuda_context = None
self.draw_needs_refresh = True
self.repaint_all = REPAINT_ALL
self.mmap = None
Expand Down Expand Up @@ -370,8 +371,27 @@ def south_y():
# pass
return x, y

def assign_cuda_context(self, opengl=False):
    """
    Return the CUDA device context of this backing, creating it on first use.

    `opengl` is forwarded to `cuda_device_context` and is only consulted
    when the context is created: once a context has been stored in
    `self.cuda_context`, it is returned as-is whatever value is passed here.
    """
    if self.cuda_context is not None:
        return self.cuda_context
    from xpra.codecs.nvjpeg.decoder import get_default_device  # @NoMove pylint: disable=no-name-in-module, import-outside-toplevel
    default_dev = get_default_device()
    assert default_dev
    from xpra.codecs.cuda_common.cuda_context import cuda_device_context
    # the `opengl` flag selects whether this is an opengl compatible context:
    context = cuda_device_context(default_dev.device_id, default_dev.device, opengl)
    self.cuda_context = context
    # allocate the context straight away, as this is the part that takes time:
    context.make_context()
    return context


def free_cuda_context(self):
    """
    Detach the CUDA context from this backing and free it.

    The attribute is cleared before `free()` is called, so calling this
    method again afterwards does nothing.
    """
    context = self.cuda_context
    if not context:
        return
    self.cuda_context = None
    context.free()

def close(self):
self.free_cuda_context()
self.cancel_fps_refresh()
self._backing = None
log("%s.close() video_decoder=%s", self, self._video_decoder)
Expand Down Expand Up @@ -444,9 +464,10 @@ def paint_jpega(self, img_data, x, y, width, height, options, callbacks):

def do_paint_jpeg(self, rgb_format, img_data, x, y, width, height, options, callbacks):
alpha_offset = options.intget("alpha-offset", 0)
log.info("do_paint_jpeg: nvjpeg_decoder=%s", self.nvjpeg_decoder)
log("do_paint_jpeg: nvjpeg_decoder=%s", self.nvjpeg_decoder)
if self.nvjpeg_decoder and not alpha_offset:
img = self.nvjpeg_decoder.decompress("RGB", img_data)
with self.assign_cuda_context(False):
img = self.nvjpeg_decoder.decompress_with_device("RGB", img_data, download=self.nvjpeg_decoder.download_from_gpu)
else:
img = self.jpeg_decoder.decompress_to_rgb(rgb_format, img_data, alpha_offset)
rgb_format = img.get_pixel_format()
Expand Down
31 changes: 20 additions & 11 deletions xpra/codecs/cuda_common/cuda_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -451,24 +451,33 @@ def __bool__(self):
def __enter__(self):
    """
    Acquire the device lock and make the CUDA context current.

    The context is created on first use by `make_context()`,
    then pushed by `push_context()`.

    Returns the value returned by `push_context()`.
    Raises AssertionError if the lock cannot be acquired without blocking.
    Any error raised while creating or pushing the context is propagated
    after the lock has been released.
    """
    # use an explicit check rather than an `assert` statement:
    # asserts are stripped under `python -O`, which would skip
    # the call to `acquire()` altogether.
    if not self.lock.acquire(False):
        raise AssertionError("failed to acquire cuda device lock")
    try:
        if not self.context:
            # context creation lives in `make_context()` only,
            # so that it is never performed twice:
            self.make_context()
        return self.push_context()
    except BaseException:
        # `__exit__` is not called when `__enter__` raises,
        # so the lock has to be released here:
        self.lock.release()
        raise

def make_context(self):
    """
    Create the CUDA context for `self.device` and store it in `self.context`.

    When `self.opengl` is set, the context is created through `pycuda.gl`,
    otherwise through the device itself with the SCHED_YIELD and MAP_HOST
    flags. The new context is popped straight away,
    callers are expected to push it when they need it
    (NOTE(review): presumably because creation leaves it current - confirm).
    """
    began = monotonic()
    ctx_flags = driver.ctx_flags
    if self.opengl:
        from pycuda import gl
        self.context = gl.make_context(self.device)
    else:
        self.context = self.device.make_context(flags=ctx_flags.SCHED_YIELD | ctx_flags.MAP_HOST)
    # only the allocation itself is timed, not the pop() below:
    elapsed_ms = 1000*(monotonic()-began)
    self.context.pop()
    log("cuda context allocation took %ims", elapsed_ms)

def push_context(self):
    """
    Push `self.context` and return it.
    """
    context = self.context
    context.push()
    return context

def __exit__(self, exc_type, exc_val, exc_tb):
self.pop_context()
self.lock.release()

def pop_context(self):
c = self.context
if c:
c.pop()
self.lock.release()
#except driver.LogicError as e:
#log.warn("Warning: PyCUDA %s", e)
#self.clean()
Expand Down
103 changes: 53 additions & 50 deletions xpra/codecs/nvjpeg/decoder.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -69,19 +69,24 @@ class NVJPEG_Exception(Exception):
pass



def download_from_gpu(buf, size):
    """
    Copy `size` bytes from the device buffer `buf` into host memory.

    Returns a new `bytearray` of length `size` filled by `driver.memcpy_dtoh`.
    The time taken is logged.
    """
    t0 = monotonic()
    host_pixels = bytearray(size)
    driver.memcpy_dtoh(host_pixels, buf)
    elapsed_ms = 1000*(monotonic()-t0)
    log("nvjpeg downloaded %i bytes in %ims", size, elapsed_ms)
    return host_pixels

def decompress(rgb_format, img_data, options=None):
#decompress using the default device:
def download_buffer(buf, size):
start = monotonic()
pixels = bytearray(size)
driver.memcpy_dtoh(pixels, buf)
end = monotonic()
log("nvjpeg downloaded %i bytes in %ims", size, 1000*(end-start))
return pixels
return decompress_with_device(default_device, rgb_format, img_data, options, download_buffer)

def decompress_with_device(device, rgb_format, img_data, options=None, download=None):
log("decompress_with_device(%s, %s, %i bytes, %s)", device, rgb_format, len(img_data), options)
#decompress using the default device,
#and download the pixel data from the GPU:
with default_device as cuda_context:
log("cuda_context=%s for device=%s", cuda_context, default_device.get_info())
return decompress_with_device(rgb_format, img_data, options, download_from_gpu)

def decompress_with_device(rgb_format, img_data, options=None, download=None):
log("decompress_with_device(%s, %i bytes, %s)", rgb_format, len(img_data), options)
cdef double start, end
cdef nvjpegHandle_t nv_handle
cdef nvjpegJpegState_t jpeg_handle
Expand All @@ -106,46 +111,44 @@ def decompress_with_device(device, rgb_format, img_data, options=None, download=

buf = None
pixels = None
with device as cuda_context:
log("cuda_context=%s for device=%s", cuda_context, device.get_info())
try:
errcheck(nvjpegCreateSimple(&nv_handle), "nvjpegCreateSimple")
try:
errcheck(nvjpegCreateSimple(&nv_handle), "nvjpegCreateSimple")
try:
errcheck(nvjpegJpegStateCreate(nv_handle, &jpeg_handle), "nvjpegJpegStateCreate")
with buffer_context(img_data) as bc:
data_len = len(bc)
data_buf = <const unsigned char*> (<uintptr_t> int(bc))
errcheck(nvjpegGetImageInfo(nv_handle, data_buf, data_len,
nComponents, &subsampling, widths, heights),
"nvjpegGetImageInfo")
log("got image info: %4ix%-4i YUV%s", widths[0], heights[0], CSS_STR.get(subsampling, subsampling))
width = widths[0]
height = heights[0]
rowstride = width*3
for i in range(1, NVJPEG_MAX_COMPONENT):
nv_image.channel[i] = NULL
nv_image.pitch[i] = 0
nv_image.pitch[0] = rowstride
buf = driver.mem_alloc(rowstride*height)
dmem = <uintptr_t> int(buf)
nv_image.channel[0] = <unsigned char *> dmem
start = monotonic()
with nogil:
r = nvjpegDecode(nv_handle, jpeg_handle,
data_buf, data_len,
output_format,
&nv_image,
nv_stream)
if r:
raise NVJPEG_Exception("decoding failed: %s" % ERR_STR.get(r, r))
end = monotonic()
log("nvjpegDecode took %ims", 1000*(end-start))
if download:
pixels = download(buf, rowstride*height)
finally:
errcheck(nvjpegJpegStateDestroy(jpeg_handle), "nvjpegJpegStateDestroy")
errcheck(nvjpegJpegStateCreate(nv_handle, &jpeg_handle), "nvjpegJpegStateCreate")
with buffer_context(img_data) as bc:
data_len = len(bc)
data_buf = <const unsigned char*> (<uintptr_t> int(bc))
errcheck(nvjpegGetImageInfo(nv_handle, data_buf, data_len,
nComponents, &subsampling, widths, heights),
"nvjpegGetImageInfo")
log("got image info: %4ix%-4i YUV%s", widths[0], heights[0], CSS_STR.get(subsampling, subsampling))
width = widths[0]
height = heights[0]
rowstride = width*3
for i in range(1, NVJPEG_MAX_COMPONENT):
nv_image.channel[i] = NULL
nv_image.pitch[i] = 0
nv_image.pitch[0] = rowstride
buf = driver.mem_alloc(rowstride*height)
dmem = <uintptr_t> int(buf)
nv_image.channel[0] = <unsigned char *> dmem
start = monotonic()
with nogil:
r = nvjpegDecode(nv_handle, jpeg_handle,
data_buf, data_len,
output_format,
&nv_image,
nv_stream)
if r:
raise NVJPEG_Exception("decoding failed: %s" % ERR_STR.get(r, r))
end = monotonic()
log("nvjpegDecode took %ims", 1000*(end-start))
if download:
pixels = download(buf, rowstride*height)
finally:
errcheck(nvjpegDestroy(nv_handle), "nvjpegDestroy")
errcheck(nvjpegJpegStateDestroy(jpeg_handle), "nvjpegJpegStateDestroy")
finally:
errcheck(nvjpegDestroy(nv_handle), "nvjpegDestroy")
return ImageWrapper(0, 0, width, height, pixels, rgb_format, 24, rowstride, planes=ImageWrapper.PACKED)


Expand Down

0 comments on commit 2f1c3ab

Please sign in to comment.