"""Python bindings for image processing CUDA functions"""
from typing import Tuple
from typing import List
class pybind11_object:
    """Stub base class for the pybind11-backed classes declared in this module."""
class buffer:
    """Any object that supports the buffer interface, like bytearray or numpy.ndarray."""
class capsule:
    """Python object that contains a reference to a C object."""
class CuRandError(Exception):
    """Exception type exported by the bindings.

    NOTE(review): presumably raised when a cuRAND call fails (inferred from
    the name only) -- confirm against the C++ sources.
    """
class DLDataTypeCode:
    """
    :ref:`cpp/tensor:dlpack` type code enum.

    Members:

      kDLInt :
        Signed integer.

      kDLUInt :
        Unsigned integer.

      kDLFloat :
        Floating point number.
    """

    def __init__(self: 'DLDataTypeCode', arg0: int) -> None:
        """__init__(self: augpy._augpy.DLDataTypeCode, arg0: int) -> None"""

    @property
    def kDLInt(self):
        """Member of the :ref:`cpp/tensor:dlpack` type code enum: signed integer."""

    @property
    def kDLUInt(self):
        """Member of the :ref:`cpp/tensor:dlpack` type code enum: unsigned integer."""

    @property
    def kDLFloat(self):
        """Member of the :ref:`cpp/tensor:dlpack` type code enum: floating point number."""
# Module-level enum values: signed integer (raw code 0) and floating point
# (raw code 2), matching the members documented on DLDataTypeCode.
kDLInt, kDLFloat = DLDataTypeCode(0), DLDataTypeCode(2)
class DLDataType(pybind11_object):
    """
    :ref:`cpp/tensor:dlpack` data type for :py:class:`CudaTensors <CudaTensor>`.

    Parameters:
        code: See :py:class:`DLDataTypeCode`
        bits: Number of bits
        lanes: Number of elements for vector types;
            must be 1 to use with :py:class:`CudaTensor`
    """

    def __init__(self: 'DLDataType', code: int, bits: int, lanes: int = 1) -> None:
        """__init__(self: augpy._augpy.DLDataType, code: int, bits: int, lanes: int = 1) -> None"""

    @property
    def bits(self):
        """Number of bits."""

    @property
    def code(self):
        """See :py:class:`DLDataTypeCode`."""

    @property
    def itemsize(self):
        """Number of bytes per element with this data type."""

    @property
    def lanes(self):
        """
        Number of elements for vector types.
        Must be 1 to use with :py:class:`CudaTensor`.
        """
# Module-level enum value for unsigned integer data (raw type code 1).
kDLUInt = DLDataTypeCode(1)
class CudaDevice(pybind11_object):
    """
    Create a new CudaDevice with the given Cuda device ID.
    0 is the default and typically fastest device in the system.

    Parameters:
        device_id: GPU device ID
    """

    def __init__(self: 'CudaDevice', device_id: int) -> None:
        """__init__(self: augpy._augpy.CudaDevice, device_id: int) -> None"""

    def activate(self: 'CudaDevice') -> None:
        """activate(self: augpy._augpy.CudaDevice) -> None

        Make this the :ref:`py/core:current_stream`
        and remember the previous stream.
        """
        # NOTE(review): the text talks about streams although this is a device
        # method; possibly copied from CudaStream.activate -- confirm upstream.

    def deactivate(self: 'CudaDevice') -> None:
        """deactivate(self: augpy._augpy.CudaDevice) -> None

        Make the previous stream the :ref:`py/core:current_stream`.
        """
        # NOTE(review): same stream/device wording question as in activate().

    def get_device(self: 'CudaDevice') -> int:
        """get_device(self: augpy._augpy.CudaDevice) -> int

        Return the device ID.
        """

    def get_properties(self: 'CudaDevice') -> 'CudaDeviceProp':
        """get_properties(self: augpy._augpy.CudaDevice) -> augpy._augpy.CudaDeviceProp

        Return the device properties,
        see :ref:`py/core:get_device_properties` for more details.
        """

    def synchronize(self: 'CudaDevice') -> None:
        """synchronize(self: augpy._augpy.CudaDevice) -> None

        Block until all work on this device has finished.
        Cuda uses busy waiting to achieve this.
        See synchronization method of
        :ref:`py/core:CudaStream` or :ref:`py/core:CudaEvent`
        to avoid the CPU load this incurs.
        """
class CudaDeviceProp(pybind11_object):
    """
    The `cudaDeviceProp <https://docs.nvidia.com/cuda/cuda-runtime-api/structcudaDeviceProp.html>`_
    struct extended with stream priority fields
    :py:attr:`leastStreamPriority` and :py:attr:`greatestStreamPriority`,
    :py:attr:`coresPerMultiprocessor`, and :py:attr:`maxGridSize`.
    """

    def __init__(self):
        """Initialize self. See help(type(self)) for accurate signature."""

    @property
    def coresPerMultiprocessor(self):
        """Number of Cuda cores per multiprocessor."""

    @property
    def coresPerSM(self):
        """Number of Cuda cores per SM."""

    @property
    def greatestStreamPriority(self):
        """Highest priority a Cuda stream on this device can have."""

    @property
    def l2CacheSize(self):
        """Size of L2 cache in bytes."""

    @property
    def leastStreamPriority(self):
        """Lowest priority a Cuda stream on this device can have."""

    @property
    def major(self):
        """Major compute capability."""

    @property
    def maxGridSize(self):
        """Max number of blocks in each grid dimension."""

    @property
    def maxThreadsDim(self):
        """Maximum size of each dimension of a block."""

    @property
    def maxThreadsPerBlock(self):
        """Maximum number of threads per block."""

    @property
    def maxThreadsPerMultiProcessor(self):
        """Maximum resident threads per multiprocessor."""

    @property
    def minor(self):
        """Minor compute capability."""

    @property
    def multiProcessorCount(self):
        """Number of multiprocessors on device."""

    @property
    def name(self):
        """ASCII string identifying device."""

    @property
    def numCudaCores(self):
        """Total number of Cuda cores."""

    @property
    def regsPerBlock(self):
        """32-bit registers available per block."""

    @property
    def regsPerMultiprocessor(self):
        """32-bit registers available per multiprocessor."""

    @property
    def sharedMemPerBlock(self):
        """Shared memory available per block in bytes."""

    @property
    def sharedMemPerMultiprocessor(self):
        """Shared memory available per multiprocessor in bytes."""

    @property
    def streamPrioritiesSupported(self):
        """Device supports stream priorities."""

    @property
    def totalConstMem(self):
        """Constant memory available on device in bytes."""

    @property
    def totalGlobalMem(self):
        """Global memory available on device in bytes."""

    @property
    def warpSize(self):
        """Warp size in threads."""
class CudaError(Exception):
    """Exception type exported by the bindings.

    NOTE(review): presumably raised when a Cuda runtime call fails (inferred
    from the name only) -- confirm against the C++ sources.
    """
class CudaEvent(pybind11_object):
    """
    Convenience wrapper for the
    `cudaEvent_t <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html>`_.

    Creating a new CudaEvent retrieves an event from the event pool of the
    :ref:`py/core:current_device`.
    """

    def __init__(self: 'CudaEvent') -> None:
        """__init__(self: augpy._augpy.CudaEvent) -> None"""

    def query(self: 'CudaEvent') -> bool:
        """query(self: augpy._augpy.CudaEvent) -> bool

        Returns ``True`` if event has occurred.
        """

    def record(self: 'CudaEvent') -> None:
        """record(self: augpy._augpy.CudaEvent) -> None

        Record wrapped event on :ref:`py/core:current_stream`.
        """

    def synchronize(self: 'CudaEvent', microseconds: int = 100) -> None:
        """synchronize(self: augpy._augpy.CudaEvent, microseconds: int = 100) -> None

        Block until event has occurred.
        Checks in ``microseconds`` interval.
        Faster intervals make this more accurate, but increase CPU load.
        Uses standard Cuda busy-waiting method if ``microseconds <= 0``.

        Parameters:
            microseconds: check interval
        """
class CudaStream(pybind11_object):
    """
    Convenience wrapper for the
    `cudaStream_t <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html>`_
    type.

    Creates a new Cuda stream on the given device.
    Lower numbers mean higher priority,
    and values are clipped to the valid range.
    Use :py:func:`get_device_properties`
    to get the range of possible values for a device.

    See:
        `cudaStreamCreateWithPriority <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1ge2be9e9858849bf62ba4a8b66d1c3540>`_

    Use ``device_id=-1`` and ``priority=-1`` to get the
    :py:attr:`default_stream`.

    Parameters:
        device_id: GPU device ID
        priority: stream priority
    """

    def __init__(self: 'CudaStream', device_id: int = 0, priority: int = 0) -> None:
        """__init__(self: augpy._augpy.CudaStream, device_id: int = 0, priority: int = 0) -> None"""

    def activate(self: 'CudaStream') -> None:
        """activate(self: augpy._augpy.CudaStream) -> None

        Make this the :ref:`py/core:current_stream`
        and remember the previous stream.
        """

    def deactivate(self: 'CudaStream') -> None:
        """deactivate(self: augpy._augpy.CudaStream) -> None

        Make the previous stream the :ref:`py/core:current_stream`.
        """

    def synchronize(self: 'CudaStream', microseconds: int = 100) -> None:
        """synchronize(self: augpy._augpy.CudaStream, microseconds: int = 100) -> None

        Block until all work on this stream has finished.
        Checks in ``microseconds`` interval.
        Faster intervals make this more accurate, but increase CPU load.
        Uses standard Cuda busy-waiting method if ``microseconds <= 0``.

        Parameters:
            microseconds: check interval
        """
class CudaTensor(pybind11_object):
    """
    Create a new, empty tensor on a GPU device.

    Parameters:
        shape: shape of the tensor
        dtype: data type
        device_id: Cuda device id
    """

    def __init__(
        self: 'CudaTensor',
        shape: List[int],
        dtype: DLDataType = DLDataType(code=kDLUInt, bits=8),
        device_id: int = 0,
    ) -> None:
        """__init__(self: augpy._augpy.CudaTensor, shape: List[int], dtype: augpy._augpy.DLDataType = DLDataType(code=kDLUInt, bits=8), device_id: int = 0) -> None"""

    @property
    def byte_offset(self):
        """Starting offset in bytes for the data pointer."""

    @property
    def dtype(self):
        """Tensor data type."""

    def fill(self, *args, **kwargs):
        """fill(*args, **kwargs)
        Overloaded function.

        1. fill(self: augpy._augpy.CudaTensor, scalar: float) -> augpy._augpy.CudaTensor

        Fill the tensor with the given scalar value.

        :returns: this tensor

        2. fill(self: augpy._augpy.CudaTensor, other: augpy._augpy.CudaTensor) -> augpy._augpy.CudaTensor

        Copy the given tensor into this tensor.

        :returns: this tensor
        """

    @property
    def is_contiguous(self):
        """
        ``True`` if the tensor is contiguous, i.e.,
        elements are located next to each other in memory.
        """

    @property
    def itemsize(self):
        """Size of one element in bytes."""

    @property
    def ndim(self):
        """Number of dimensions."""

    def numpy(self, *args, **kwargs):
        """numpy(*args, **kwargs)
        Overloaded function.

        1. numpy(self: augpy._augpy.CudaTensor) -> array

        Create a new numpy array and start copying data from
        the device to host memory.

        2. numpy(self: augpy._augpy.CudaTensor, array: buffer = None) -> array

        Create a new numpy array from the given buffer and
        start copying data from the device to host memory.

        :param array: buffer to create new array from
        """

    @property
    def ptr(self):
        """Data pointer."""

    def reshape(self: 'CudaTensor', shape: List[int]) -> 'CudaTensor':
        """reshape(self: augpy._augpy.CudaTensor, shape: List[int]) -> augpy._augpy.CudaTensor

        Return a new tensor that uses the same backing memory
        with a different shape. Shape must have same number
        of elements. Only contiguous tensors can be reshaped.

        Parameters:
            shape: new shape
        """

    @property
    def shape(self):
        """Tensor shape."""

    @property
    def size(self):
        """Number of elements in the tensor."""

    @property
    def strides(self):
        """
        Tensor strides, i.e., the number of elements to add
        to a flat tensor to reach the next element for each
        dimension.
        """

    def sum(self, *args, **kwargs):
        """sum(*args, **kwargs)
        Overloaded function.

        1. sum(self: augpy._augpy.CudaTensor, upcast: bool = False) -> augpy._augpy.CudaTensor

        Sum all values in the tensor.

        :param upcast: if ``True``, the output scalar tensor will
            be promoted to a more expressive data type to avoid saturation
        :returns: sum as scalar tensor

        2. sum(self: augpy._augpy.CudaTensor, axis: int, keepdim: bool = False, upcast: bool = False, out: augpy._augpy.CudaTensor = None, blocks_per_sm: int = 8, threads: int = 0) -> augpy._augpy.CudaTensor

        Sum all values in the tensor along an axis.

        :param axis: which axis to sum along
        :param keepdim: keep the summed dimension with size 1
        :param upcast: if ``True``, the output scalar tensor will
            be promoted to a more expressive data type to avoid saturation
        :param out: use this tensor as output, must have correct
            shape, and same data type if ``upcast`` is ``False``,
            otherwise promoted type is required
        :returns: tensor summed along axis
        """
class CutlassError(Exception):
    """Exception type exported by the bindings.

    NOTE(review): presumably raised when a CUTLASS operation fails (inferred
    from the name only) -- confirm against the C++ sources.
    """
class Decoder(pybind11_object):
    """
    Wrapper for Nvjpeg-based JPEG decoding,
    created on the :ref:`py/core:current_device`.

    See:
        `Nvjpeg docs <https://docs.nvidia.com/cuda/nvjpeg/index.html#nvjpeg-set-device-mem-padding>`_

    Parameters:
        device_padding: memory padding on the device
        host_padding: memory padding on the host
        gpu_huffman: enable Huffman decoding on the GPU;
            not recommended unless you really need
            to offload from CPU
    """

    def __init__(
        self: 'Decoder',
        device_padding: int = 16777216,
        host_padding: int = 8388608,
        gpu_huffman: bool = False,
    ) -> None:
        """__init__(self: augpy._augpy.Decoder, device_padding: int = 16777216, host_padding: int = 8388608, gpu_huffman: bool = False) -> None"""

    def decode(self: 'Decoder', data: str, buffer: CudaTensor = None) -> CudaTensor:
        """decode(self: augpy._augpy.Decoder, data: str, buffer: augpy._augpy.CudaTensor = None) -> augpy._augpy.CudaTensor

        Decode a JPEG image using Nvjpeg.
        Output is in :math:`(H,W,C)` format and resides on the GPU device.

        Parameters:
            data: compressed JPEG image as a JFIF string, i.e.,
                the full file contents
            buffer: optional buffer to use; may be ``None``;
                if not ``None`` must be big enough to contain
                the decoded image

        Returns:
            new tensor with decoded image on GPU in :math:`(H,W,C)` format
        """
class MemoryError(Exception):
    """Exception type exported by the bindings.

    This class intentionally keeps the name used by the extension module, so
    inside this module it shadows the builtin ``MemoryError``; it derives
    from ``Exception`` directly and is unrelated to the builtin.
    """
class NvJpegError(Exception):
    """Exception type exported by the bindings.

    NOTE(review): presumably raised when an Nvjpeg call fails (inferred from
    the name only) -- confirm against the C++ sources.
    """
class RandomNumberGenerator(pybind11_object):
    """
    A convenient wrapper for cuRAND methods
    that fill tensors with pseudo-random numbers.

    Parameters:
        device_id: GPU device ID;
            if ``None``, :ref:`py/core:current_device` is used
        seed: random seed;
            if ``None``, read values from
            `std::random_device <https://en.cppreference.com/w/cpp/numeric/random/random_device>`_
            to create a random seed.
    """

    def __init__(self: 'RandomNumberGenerator', device_id: object = None, seed: object = None) -> None:
        """__init__(self: augpy._augpy.RandomNumberGenerator, device_id: object = None, seed: object = None) -> None"""

    def gaussian(
        self: 'RandomNumberGenerator',
        target: CudaTensor,
        mean: float = 0.0,
        std: float = 1.0,
        blocks_per_sm: int = 8,
        threads: int = 0,
    ) -> None:
        """gaussian(self: augpy._augpy.RandomNumberGenerator, target: augpy._augpy.CudaTensor, mean: float = 0.0, std: float = 1.0, blocks_per_sm: int = 8, threads: int = 0) -> None

        Fill ``target`` tensor with Gaussian distributed numbers
        with specified ``mean`` and standard deviation ``std``.

        .. note::
            This is supported for integer tensors. Values are
            drawn from the given distribution, then rounded and
            cast to the data type of the tensor with saturation.
            The values in an integer tensor are thus only
            approximately Gaussian distributed.

        Parameters:
            target: tensor to fill
            mean: Gaussian mean
            std: Gaussian standard deviation
        """
class WarpScaleMode:
    """
    Enum whether to scale relative to the
    shortest or longest side of the image.

    Members:

      WARP_SCALE_SHORTEST :
        Scaling is relative to the shortest side of the image.

      WARP_SCALE_LONGEST :
        Scaling is relative to the longest side of the image.
    """

    def __init__(self: 'WarpScaleMode', arg0: int) -> None:
        """__init__(self: augpy._augpy.WarpScaleMode, arg0: int) -> None"""

    @property
    def WARP_SCALE_SHORTEST(self):
        """Enum member: scaling is relative to the shortest side of the image."""

    @property
    def WARP_SCALE_LONGEST(self):
        """Enum member: scaling is relative to the longest side of the image."""
# Module-level enum values: raw code 1 is the "longest side" mode and raw
# code 0 the "shortest side" mode.
WARP_SCALE_LONGEST, WARP_SCALE_SHORTEST = WarpScaleMode(1), WarpScaleMode(0)
def add(*args, **kwargs):
    """add(*args, **kwargs)
    Overloaded function.

    1. add(tensor: augpy._augpy.CudaTensor, scalar: float, out: augpy._augpy.CudaTensor = None, blocks_per_sm: int = 8, threads: int = 512) -> augpy._augpy.CudaTensor

    Add a ``scalar`` value to a ``tensor``.

    :param tensor: tensor
    :param scalar: scalar value
    :param out: optional output tensor
    :returns: new tensor if ``out`` is ``None``, else ``out``

    2. add(tensor1: augpy._augpy.CudaTensor, tensor2: augpy._augpy.CudaTensor, out: augpy._augpy.CudaTensor = None, blocks_per_sm: int = 8, threads: int = 512) -> augpy._augpy.CudaTensor

    Add ``tensor2`` to ``tensor1``.

    :param tensor1: first tensor
    :param tensor2: second tensor
    :param out: optional output tensor
    :returns: new tensor if ``out`` is ``None``, else ``out``
    """
def all(tensor: CudaTensor) -> CudaTensor:
    """all(tensor: augpy._augpy.CudaTensor) -> augpy._augpy.CudaTensor

    Check whether all elements in a tensor are greater than zero.

    Parameters:
        tensor: tensor to check, must be contiguous

    Returns:
        ``0`` or ``1`` as scalar ``uint8`` tensor
    """
def array_to_tensor(*args, **kwargs):
    """array_to_tensor(*args, **kwargs)
    Overloaded function.

    1. array_to_tensor(array: buffer, device_id: int = 0) -> augpy._augpy.CudaTensor

    Copy a Python buffer into a new tensor on the specified GPU device.
    This initiates an asynchronous copy from host to device memory.

    2. array_to_tensor(array: buffer, tensor: augpy._augpy.CudaTensor) -> augpy._augpy.CudaTensor

    Copy a Python buffer to a tensor created from the given buffer ``tensor``.
    This initiates an asynchronous copy from host to device memory.
    """
def box_blur_single(input: CudaTensor, ksize: int, out: CudaTensor = None) -> CudaTensor:
    """box_blur_single(input: augpy._augpy.CudaTensor, ksize: int, out: augpy._augpy.CudaTensor = None) -> augpy._augpy.CudaTensor

    Apply box blur to a single image.

    Kernel size describes both width and height in pixels
    of the area in the input that is averaged for each
    output pixel.
    Odd values are recommended for best results.
    For even values, the center of the kernel is below
    and to the right of the true center.
    This means the output is shifted up and left by half
    a pixel.

    Parameters:
        input: image tensor in channel-first format
        ksize: kernel size in pixels
        out: output tensor (may be ``None``)

    Returns:
        new tensor if ``out`` is ``None``, else ``out``
    """
def cast(*args, **kwargs):
    """cast(*args, **kwargs)
    Overloaded function.

    1. cast(tensor: augpy._augpy.CudaTensor, out: augpy._augpy.CudaTensor, blocks_per_sm: int = 8, threads: int = 0) -> augpy._augpy.CudaTensor

    Read values from ``tensor``, cast them to the data type of
    ``out`` and store them there.
    ``tensor`` and ``out`` must have the same shape.

    :param tensor: source tensor
    :param out: output tensor

    2. cast(tensor: augpy._augpy.CudaTensor, dtype: augpy._augpy.DLDataType, blocks_per_sm: int = 8, threads: int = 0) -> augpy._augpy.CudaTensor

    Create a new tensor with values from ``tensor``
    cast to the given data type ``dtype``.

    :param tensor: source tensor
    :param dtype: target data type
    :returns: new tensor with given data type
    """
def copy(src: CudaTensor, dst: CudaTensor, blocks_per_sm: int = 8, threads: int = 0) -> CudaTensor:
    """copy(src: augpy._augpy.CudaTensor, dst: augpy._augpy.CudaTensor, blocks_per_sm: int = 8, threads: int = 0) -> augpy._augpy.CudaTensor

    Copy ``src`` into ``dst``.
    Supports broadcasting.
    """
# Module-level stream built with device_id=-1 and priority=-1, the argument
# combination the CudaStream docstring names for obtaining the default stream.
default_stream = CudaStream(device_id=-1, priority=-1)
def disable_profiler() -> None:
    """disable_profiler() -> None

    Disable the Cuda profiler.
    """
def div(*args, **kwargs):
    """div(*args, **kwargs)
    Overloaded function.

    1. div(tensor: augpy._augpy.CudaTensor, scalar: float, out: augpy._augpy.CudaTensor = None, blocks_per_sm: int = 8, threads: int = 512) -> augpy._augpy.CudaTensor

    Divide a ``tensor`` by a ``scalar`` value.

    :param tensor: tensor
    :param scalar: scalar value
    :param out: optional output tensor
    :returns: new tensor if ``out`` is ``None``, else ``out``

    2. div(tensor1: augpy._augpy.CudaTensor, tensor2: augpy._augpy.CudaTensor, out: augpy._augpy.CudaTensor = None, blocks_per_sm: int = 8, threads: int = 512) -> augpy._augpy.CudaTensor

    Divide ``tensor1`` by ``tensor2``.

    :param tensor1: first tensor
    :param tensor2: second tensor
    :param out: optional output tensor
    :returns: new tensor if ``out`` is ``None``, else ``out``
    """
def empty_like(tensor: CudaTensor) -> CudaTensor:
    """empty_like(tensor: augpy._augpy.CudaTensor) -> augpy._augpy.CudaTensor

    Create a new tensor with the same shape,
    dtype and on the same device as ``tensor``.
    """
def enable_profiler() -> None:
    """enable_profiler() -> None

    Enable the Cuda profiler.
    """
def eq(*args, **kwargs):
    """eq(*args, **kwargs)
    Overloaded function.

    1. eq(tensor: augpy._augpy.CudaTensor, scalar: float, out: augpy._augpy.CudaTensor = None, blocks_per_sm: int = 8, threads: int = 512) -> augpy._augpy.CudaTensor

    Compute ``tensor == scalar`` as ``uint8`` tensor,
    where ``1`` means the condition is met and ``0`` otherwise.

    :param tensor: tensor
    :param scalar: scalar value
    :param out: optional output tensor
    :returns: new tensor if ``out`` is ``None``, else ``out``

    2. eq(tensor1: augpy._augpy.CudaTensor, tensor2: augpy._augpy.CudaTensor, out: augpy._augpy.CudaTensor = None, blocks_per_sm: int = 8, threads: int = 512) -> augpy._augpy.CudaTensor

    Compute ``tensor1 == tensor2`` as ``uint8`` tensor,
    where ``1`` means the condition is met and ``0`` otherwise.

    :param tensor1: first tensor
    :param tensor2: second tensor
    :param out: optional output tensor
    :returns: new tensor if ``out`` is ``None``, else ``out``
    """
def export_dltensor(tensor: object, name: str = 'dltensor', destruct: bool = True) -> capsule:
    """export_dltensor(tensor: object, name: str = 'dltensor', destruct: bool = True) -> capsule

    Export a GPU tensor to be used by another library.

    Parameters:
        tensor: Python-wrapped CudaTensor
        name: name under which the tensor is stored in the returned
            :py:ref:`capsule <Capsules>`, e.g., ``"dltensor"`` for Pytorch
        destruct: if ``True``, add a destructor to the
            :py:ref:`capsule <Capsules>` which will delete the tensor
            when the capsule is deleted; only set to ``False`` if you
            know what you're doing

    Returns:
        :py:ref:`capsule <Capsules>` with exported :py:class:`CudaTensor`
    """
def fill(scalar: float, dst: CudaTensor, blocks_per_sm: int = 8, threads: int = 0) -> CudaTensor:
    """fill(scalar: float, dst: augpy._augpy.CudaTensor, blocks_per_sm: int = 8, threads: int = 0) -> augpy._augpy.CudaTensor

    Fill ``dst`` with the given ``scalar`` value.
    """
# Predefined floating-point data types in 16, 32 and 64 bit widths.
float16, float32, float64 = (
    DLDataType(code=kDLFloat, bits=width) for width in (16, 32, 64)
)
def fma(
    scalar: float,
    tensor1: CudaTensor,
    tensor2: CudaTensor,
    out: CudaTensor = None,
    blocks_per_sm: int = 8,
    threads: int = 512,
) -> CudaTensor:
    r"""fma(scalar: float, tensor1: augpy._augpy.CudaTensor, tensor2: augpy._augpy.CudaTensor, out: augpy._augpy.CudaTensor = None, blocks_per_sm: int = 8, threads: int = 512) -> augpy._augpy.CudaTensor

    Compute a fused multiply-add on a scalar and two tensors, i.e.,

    .. math::
        r = s \cdot t_1 \cdot t_2

    If ``tensor1`` has an unsigned integer data type,
    then ``tensor2`` must have the signed version of the same type,
    e.g., a ``uint8`` tensor must be paired with a ``int8`` tensor.

    Parameters:
        scalar: scalar factor
        tensor1: tensor :math:`t_1`
        tensor2: tensor :math:`t_2`
        out: optional output tensor :math:`r`

    Returns:
        new tensor if ``out`` is ``None``, else ``out``
    """
    # The docstring is raw so the LaTeX backslashes survive verbatim.
    # NOTE(review): the documented formula multiplies all three operands,
    # whereas a fused multiply-add is conventionally s * t1 + t2 -- confirm
    # against the C++ kernel before changing the formula.
def gaussian_blur(input: CudaTensor, sigmas: CudaTensor, max_ksize: int, out: CudaTensor = None) -> CudaTensor:
    """gaussian_blur(input: augpy._augpy.CudaTensor, sigmas: augpy._augpy.CudaTensor, max_ksize: int, out: augpy._augpy.CudaTensor = None) -> augpy._augpy.CudaTensor

    Apply Gaussian blur to a batch of images.

    Maximum kernel size can be calculated like this:

    ``ksize = max(3, int(max(sigmas) * 6.6 - 2.3) | 1)``

    I.e., ``ksize`` is at least 3 and always odd.

    The given kernel size defines the upper limit.
    The actual kernel size is calculated with the
    formula above and clipped at the given maximum.

    Smaller values can be given to trade speed vs quality.
    Bigger values typically do not visibly improve quality.

    Odd values are strongly recommended for best results.
    For even values, the center of the kernel is below
    and to the right of the true center.
    This means the output is shifted up and left by half
    a pixel.
    This can lead to inconsistencies between images
    in the batch.
    Images with large sigmas may be shifted, while smaller
    sigmas mean no shift occurs.

    Parameters:
        input: batch tensor with images in first dimension
        sigmas: float tensor with one sigma value per image in the batch
        max_ksize: maximum kernel size in pixels
        out: output tensor (may be ``None``)

    Returns:
        new tensor if ``out`` is ``None``, else ``out``
    """
def gaussian_blur_single(input: CudaTensor, sigma: float, out: CudaTensor = None) -> CudaTensor:
    """gaussian_blur_single(input: augpy._augpy.CudaTensor, sigma: float, out: augpy._augpy.CudaTensor = None) -> augpy._augpy.CudaTensor

    Apply Gaussian blur to a single image.

    Kernel size is calculated like this:

    ``ksize = max(3, int(sigma * 6.6 - 2.3) | 1)``

    I.e., ``ksize`` is at least 3 and always odd.

    Parameters:
        input: image tensor in channel-first format
        sigma: standard deviation of the kernel
        out: output tensor (may be ``None``)

    Returns:
        new tensor if ``out`` is ``None``, else ``out``
    """
def ge(
    tensor: CudaTensor,
    scalar: float,
    out: CudaTensor = None,
    blocks_per_sm: int = 8,
    threads: int = 512,
) -> CudaTensor:
    """ge(tensor: augpy._augpy.CudaTensor, scalar: float, out: augpy._augpy.CudaTensor = None, blocks_per_sm: int = 8, threads: int = 512) -> augpy._augpy.CudaTensor

    Compute ``tensor >= scalar`` as ``uint8`` tensor,
    where ``1`` means the condition is met and ``0`` otherwise.

    :param tensor: tensor
    :param scalar: scalar value
    :param out: optional output tensor
    :returns: new tensor if ``out`` is ``None``, else ``out``
    """
def gemm(A: CudaTensor, B: CudaTensor, C: CudaTensor = None, alpha: float = 1.0, beta: float = 0.0) -> CudaTensor:
    r"""gemm(A: augpy._augpy.CudaTensor, B: augpy._augpy.CudaTensor, C: augpy._augpy.CudaTensor = None, alpha: float = 1.0, beta: float = 0.0) -> augpy._augpy.CudaTensor

    Calculate the matrix multiplication of two 2D tensors.
    More specifically calculates

    .. math::
        C = A \times (\alpha \cdot B) + \beta \cdot C

    Only ``float`` and ``double`` are supported.
    All tensors must have the same data type.
    All tensors must be contiguous.

    Returns:
        new output tensor if ``C`` is ``None``, otherwise ``C``
    """
    # The docstring is raw: in a normal string literal "\t", "\a" and "\b"
    # inside \times, \alpha and \beta would be turned into control characters.
def get_current_device() -> int:
    """get_current_device() -> int

    Returns the active device ID.

    See:
        :ref:`py/core:current_device`.
    """
def get_current_stream() -> CudaStream:
    """get_current_stream() -> augpy._augpy.CudaStream

    Returns the active :py:class:`CudaStream`.

    See:
        :ref:`py/core:current_stream`
    """
def get_device_properties(device_id: int) -> CudaDeviceProp:
    """get_device_properties(device_id: int) -> augpy._augpy.CudaDeviceProp

    Get :py:class:`CudaDeviceProp` for given device.

    Parameters:
        device_id: Cuda device id

    Returns:
        CudaDeviceProp: properties of device
    """
def gt(
    tensor: CudaTensor,
    scalar: float,
    out: CudaTensor = None,
    blocks_per_sm: int = 8,
    threads: int = 512,
) -> CudaTensor:
    """gt(tensor: augpy._augpy.CudaTensor, scalar: float, out: augpy._augpy.CudaTensor = None, blocks_per_sm: int = 8, threads: int = 512) -> augpy._augpy.CudaTensor

    Compute ``tensor > scalar`` as ``uint8`` tensor,
    where ``1`` means the condition is met and ``0`` otherwise.

    :param tensor: tensor
    :param scalar: scalar value
    :param out: optional output tensor
    :returns: new tensor if ``out`` is ``None``, else ``out``
    """
def import_dltensor(tensor_capsule: capsule, name: str) -> CudaTensor:
    """import_dltensor(tensor_capsule: capsule, name: str) -> augpy._augpy.CudaTensor

    Import a GPU tensor from another library into augpy.

    Parameters:
        tensor_capsule: a Python :py:ref:`capsule <Capsules>` object that contains
            a :any:`DLManagedTensor`
        name: name under which the tensor is stored in the
            :py:ref:`capsule <Capsules>`, e.g., ``"dltensor"`` for Pytorch

    Returns:
        other tensor wrapped in a :py:class:`CudaTensor`
    """
def init() -> None:
    """init() -> None

    Set the `cudaDeviceScheduleYield
    <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DEVICE.html#group__CUDART__DEVICE_1g69e73c7dda3fc05306ae7c811a690fac>`_
    flag for the :ref:`py/core:current_device`.

    .. warning::
        EXPERIMENTAL! MAY REDUCE GPU THROUGHPUT AND BREAK MANY THINGS!
    """
# Predefined signed-integer data types; the construction order (16, 32, 64, 8
# bits) mirrors the original alphabetical listing.
int16, int32, int64, int8 = (
    DLDataType(code=kDLInt, bits=width) for width in (16, 32, 64, 8)
)
def le(*args, **kwargs):
    """le(*args, **kwargs)
    Overloaded function.

    1. le(tensor: augpy._augpy.CudaTensor, scalar: float, out: augpy._augpy.CudaTensor = None, blocks_per_sm: int = 8, threads: int = 512) -> augpy._augpy.CudaTensor

    Compute ``tensor <= scalar`` as ``uint8`` tensor,
    where ``1`` means the condition is met and ``0`` otherwise.

    :param tensor: tensor
    :param scalar: scalar value
    :param out: optional output tensor
    :returns: new tensor if ``out`` is ``None``, else ``out``

    2. le(tensor1: augpy._augpy.CudaTensor, tensor2: augpy._augpy.CudaTensor, out: augpy._augpy.CudaTensor = None, blocks_per_sm: int = 8, threads: int = 512) -> augpy._augpy.CudaTensor

    Compute ``tensor1 <= tensor2`` as ``uint8`` tensor,
    where ``1`` means the condition is met and ``0`` otherwise.

    :param tensor1: first tensor
    :param tensor2: second tensor
    :param out: optional output tensor
    :returns: new tensor if ``out`` is ``None``, else ``out``
    """
def lighting(
    imtensor: CudaTensor,
    gammagrays: CudaTensor,
    gammacolors: CudaTensor,
    contrasts: CudaTensor,
    vmin: float,
    vmax: float,
    out: CudaTensor = None,
) -> CudaTensor:
    r"""lighting(imtensor: augpy._augpy.CudaTensor, gammagrays: augpy._augpy.CudaTensor, gammacolors: augpy._augpy.CudaTensor, contrasts: augpy._augpy.CudaTensor, vmin: float, vmax: float, out: augpy._augpy.CudaTensor = None) -> augpy._augpy.CudaTensor

    Apply lighting augmentation to a batch of images.

    This is a four-step process:

    #. Normalize values :math:`v_{norm} = \frac{v - v_{min}}{v_{max}-v_{min}}`
       with :math:`v_{min}` the minimum and :math:`v_{max}` the maximum
       lightness value
    #. Apply contrast change
    #. Apply gamma correction
    #. Denormalize values :math:`v' = v_{norm} * (v_{max}-v_{min}) + v_{min}`

    To change contrast two reference functions are used.
    With contrast :math:`\mathcal{c} \ge 0`, i.e., increased contrast,
    the following function is used:

    .. math::
        f_{pos}(v) =
        \frac{1.0037575963899724}{1 + exp(6.279 + v \cdot 12.558)} - 0.0018787981949862

    With contrast :math:`\mathcal{c} < 0`, i.e., decreased contrast,
    the following function is used:

    .. math::
        f_{neg}(v) =
        0.1755606108304832 \cdot atanh(v \cdot 1.986608 - 0.993304) + 0.5

    The final value is
    :math:`v' = (1-\mathcal{c}) \cdot v + \mathcal{c} \cdot f(v)`.

    Brightness and color changes are done via gamma correction.

    .. math::
        v' = v^{\gamma_{gray} \cdot \gamma_c}

    with :math:`\gamma_{gray}` the gamma for overall lightness and
    :math:`\gamma_{c}` the per-channel gamma.

    Parameters:
        imtensor: image tensor in :math:`(N,C,H,W)` format
        gammagrays: tensor of :math:`N` gamma gray values
        gammacolors: tensor of :math:`C\cdot N` gamma values in the format
            :math:`\gamma_{1,1}, \gamma_{1,2}, ..., \gamma_{1,C},
            \gamma_{2,1}, \gamma_{2,2}, ... \gamma_{N,C-1}, \gamma_{N,C}`
        contrasts: tensor of :math:`N` contrast values in :math:`[-1, 1]`
        vmin: minimum lightness value in images
        vmax: maximum lightness value in images
        out: output tensor (may be ``None``)

    Returns:
        new tensor if ``out`` is ``None``, else ``out``
    """
    # The docstring is raw: otherwise "\f" in \frac becomes a form feed and the
    # remaining LaTeX commands trigger invalid-escape warnings.
def lt(*args, **kwargs):
    """lt(*args, **kwargs)
    Overloaded function.

    1. lt(tensor: augpy._augpy.CudaTensor, scalar: float, out: augpy._augpy.CudaTensor = None, blocks_per_sm: int = 8, threads: int = 512) -> augpy._augpy.CudaTensor

    Compute ``tensor < scalar`` as ``uint8`` tensor,
    where ``1`` means the condition is met and ``0`` otherwise.

    :param tensor: tensor
    :param scalar: scalar value
    :param out: optional output tensor
    :returns: new tensor if ``out`` is ``None``, else ``out``

    2. lt(tensor1: augpy._augpy.CudaTensor, tensor2: augpy._augpy.CudaTensor, out: augpy._augpy.CudaTensor = None, blocks_per_sm: int = 8, threads: int = 512) -> augpy._augpy.CudaTensor

    Compute ``tensor1 < tensor2`` as ``uint8`` tensor,
    where ``1`` means the condition is met and ``0`` otherwise.

    :param tensor1: first tensor
    :param tensor2: second tensor
    :param out: optional output tensor
    :returns: new tensor if ``out`` is ``None``, else ``out``
    """
def make_affine_matrix(
    out: buffer,
    source_height: int,
    source_width: int,
    target_height: int,
    target_width: int,
    angle: float = 0.0,
    scale: float = 1.0,
    aspect: float = 1.0,
    shifty: float = 0.0,
    shiftx: float = 0.0,
    sheary: float = 0.0,
    shearx: float = 0.0,
    hmirror: bool = False,
    vmirror: bool = False,
    scale_mode: WarpScaleMode = WarpScaleMode.WARP_SCALE_SHORTEST,
    max_supersampling: int = 3,
) -> int:
    # Raw docstring: the :math: roles contain backslashes (``\times``,
    # ``\cdot``) that must not be interpreted as string escapes.
    r"""make_affine_matrix(out: buffer, source_height: int, source_width: int, target_height: int, target_width: int, angle: float = 0.0, scale: float = 1.0, aspect: float = 1.0, shifty: float = 0.0, shiftx: float = 0.0, sheary: float = 0.0, shearx: float = 0.0, hmirror: bool = False, vmirror: bool = False, scale_mode: augpy._augpy.WarpScaleMode = WarpScaleMode.WARP_SCALE_SHORTEST, max_supersampling: int = 3) -> int

    Create a :math:`2 \times 3` matrix for a set of affine
    transformations.
    This matrix is compatible with the `warpAffine
    <https://docs.opencv.org/3.4/da/d54/group__imgproc__transform.html#ga0203d9ee5fcd28d40dbc4a1ea4451983>`_
    function of OpenCV with the `WARP_INVERSE_MAP
    <https://docs.opencv.org/3.4/da/d54/group__imgproc__transform.html#gga5bb5a1fea74ea38e1a5445ca803ff121aa48be1c433186c4eae1ea86aa0ca75ba>`_
    flag set.

    Transforms are applied in the following order:

    #. shear
    #. scale & aspect ratio
    #. horizontal & vertical mirror
    #. rotation
    #. horizontal & vertical shift

    See:
        :py:func:`make_transform` for a more convenient version of this function.

    Parameters:
        out: output buffer that matrix is written to;
            must be a writeable :math:`2 \times 3` ``float`` buffer
        source_height: :math:`h_s` height of the image in pixels
        source_width: :math:`w_s` width of the image in pixels
        target_height: :math:`h_t` height of the output canvas in pixels
        target_width: :math:`w_t` width of the output canvas in pixels
        angle: clockwise angle in degrees
            with image center as rotation axis
        scale: scale factor relative to output size;
            1 means fill target height or width wise depending
            on ``scale_mode`` and whichever is longest/shortest;
            larger values will crop,
            smaller values leave empty space in the output canvas
        aspect: controls the aspect ratio;
            1 means same as input, values greater than 1
            increase the width and reduce the height
        shifty: shift the image in y direction (vertical);
            0 centers the image on the output canvas;
            -1 means shift up as much as possible;
            1 means shift down as much as possible;
            the maximum distance to shift is
            :math:`max(scale \cdot h_s - h_t, h_t - scale \cdot h_s)`
        shiftx: same as ``shifty``, but in x direction (horizontal)
        sheary: controls up/down shear;
            for every pixel in the x direction move ``sheary`` pixels
            in y direction
        shearx: same as ``sheary`` but controls left/right shear
        hmirror: if ``True`` flip image horizontally
        vmirror: if ``True`` flip image vertically
        scale_mode: if :py:attr:`WarpScaleMode.WARP_SCALE_SHORTEST` scale
            is relative to shortest side;
            this fills the output canvas, cropping the image
            if necessary;
            if :py:attr:`WarpScaleMode.WARP_SCALE_LONGEST` scale
            is relative to longest side;
            this ensures the image is contained inside the
            output canvas, but leaves empty space
        max_supersampling: upper limit for recommended supersampling

    Returns:
        recommended supersampling factor for the warp
    """
    # Documentation stub: the body is intentionally empty.
    pass
def meminfo(device_id: int = 0) -> Tuple[int, int, int]:
    """meminfo(device_id: int = 0) -> Tuple[int, int, int]

    For the device defined by ``device_id``,
    return the current used, free, and total memory in bytes.

    Parameters:
        device_id: index of the device to query

    Returns:
        tuple of used, free, and total memory in bytes
    """
    # Documentation stub: the body is intentionally empty.
    pass
def mul(*args, **kwargs):
    """mul(*args, **kwargs)
    Overloaded function.

    1. mul(tensor: augpy._augpy.CudaTensor, scalar: float, out: augpy._augpy.CudaTensor = None, blocks_per_sm: int = 8, threads: int = 512) -> augpy._augpy.CudaTensor

    Multiply a ``tensor`` by a ``scalar`` value.

    :param tensor: tensor
    :param scalar: scalar value
    :param out: optional output tensor
    :returns: new tensor if ``out`` is ``None``, else ``out``

    2. mul(tensor1: augpy._augpy.CudaTensor, tensor2: augpy._augpy.CudaTensor, out: augpy._augpy.CudaTensor = None, blocks_per_sm: int = 8, threads: int = 512) -> augpy._augpy.CudaTensor

    Multiply ``tensor1`` by ``tensor2``.

    :param tensor1: first tensor
    :param tensor2: second tensor
    :param out: optional output tensor
    :returns: new tensor if ``out`` is ``None``, else ``out``
    """
    # Documentation stub: the body is intentionally empty.
    pass
def nvtx_range_end(end: int) -> None:
    """nvtx_range_end(end: int) -> None

    Tell the Nvidia profiler to end the given `nvtx
    <https://docs.nvidia.com/cuda/profiler-users-guide/index.html#nvtx>`_
    range.

    Parameters:
        end: ID of the range to end
    """
    # Documentation stub: the body is intentionally empty.
    pass
def nvtx_range_start(msg: str) -> int:
    """nvtx_range_start(msg: str) -> int

    Tell the Nvidia profiler to start a new `nvtx
    <https://docs.nvidia.com/cuda/profiler-users-guide/index.html#nvtx>`_
    range.
    Can be used to place marks in profiling output.

    Parameters:
        msg: Message attached to the range

    Returns:
        range ID to be used with :py:func:`nvtx_range_end`
    """
    # Documentation stub: the body is intentionally empty.
    pass
def rdiv(tensor: CudaTensor, scalar: float, out: CudaTensor = None, blocks_per_sm: int = 8, threads: int = 512) -> CudaTensor:
    """rdiv(tensor: augpy._augpy.CudaTensor, scalar: float, out: augpy._augpy.CudaTensor = None, blocks_per_sm: int = 8, threads: int = 512) -> augpy._augpy.CudaTensor

    Divide a ``scalar`` value by a ``tensor``.

    Parameters:
        tensor: tensor
        scalar: scalar value
        out: optional output tensor

    Returns:
        new tensor if ``out`` is ``None``, else ``out``
    """
    # Documentation stub: the body is intentionally empty.
    pass
def release() -> None:
    """release() -> None

    Release all allocated memory on all GPUs.
    All :py:class:`CudaTensors <CudaTensor>` become invalid immediately.

    .. warning::
        This is dangerous: every existing tensor is invalidated.
    """
    # Documentation stub: the body is intentionally empty.
    pass
def rsub(tensor: CudaTensor, scalar: float, out: CudaTensor = None, blocks_per_sm: int = 8, threads: int = 512) -> CudaTensor:
    """rsub(tensor: augpy._augpy.CudaTensor, scalar: float, out: augpy._augpy.CudaTensor = None, blocks_per_sm: int = 8, threads: int = 512) -> augpy._augpy.CudaTensor

    Subtract a ``tensor`` from a ``scalar`` value.

    Parameters:
        tensor: tensor
        scalar: scalar value
        out: optional output tensor

    Returns:
        new tensor if ``out`` is ``None``, else ``out``
    """
    # Documentation stub: the body is intentionally empty.
    pass
def sub(*args, **kwargs):
    """sub(*args, **kwargs)
    Overloaded function.

    1. sub(tensor: augpy._augpy.CudaTensor, scalar: float, out: augpy._augpy.CudaTensor = None, blocks_per_sm: int = 8, threads: int = 512) -> augpy._augpy.CudaTensor

    Subtract a ``scalar`` value from a ``tensor``.

    :param tensor: tensor
    :param scalar: scalar value
    :param out: optional output tensor
    :returns: new tensor if ``out`` is ``None``, else ``out``

    2. sub(tensor1: augpy._augpy.CudaTensor, tensor2: augpy._augpy.CudaTensor, out: augpy._augpy.CudaTensor = None, blocks_per_sm: int = 8, threads: int = 512) -> augpy._augpy.CudaTensor

    Subtract ``tensor2`` from ``tensor1``.

    :param tensor1: first tensor
    :param tensor2: second tensor
    :param out: optional output tensor
    :returns: new tensor if ``out`` is ``None``, else ``out``
    """
    # Documentation stub: the body is intentionally empty.
    pass
def sum(*args, **kwargs):
    """sum(*args, **kwargs)
    Overloaded function.

    1. sum(tensor: augpy._augpy.CudaTensor, upcast: bool = False) -> augpy._augpy.CudaTensor

    Sum all elements in a tensor with saturation.

    :param tensor: tensor to sum, must be contiguous
    :param upcast: if ``True``, returns tensor with
        ``float`` or ``double`` type
    :returns: sum value as scalar tensor

    2. sum(tensor: augpy._augpy.CudaTensor, axis: int, keepdim: bool = False, upcast: bool = False, out: augpy._augpy.CudaTensor = None, blocks_per_sm: int = 8, num_threads: int = 0) -> augpy._augpy.CudaTensor

    Sum of all elements along an axis in a tensor with saturation.

    :param tensor: tensor to sum, may be strided
    :param axis: axis index to sum along
    :param keepdim: if ``True``, keep sum axis dimension with length 1
    :param upcast: if ``True``, returns tensor with
        ``float`` or ``double`` type
    :param out: output tensor (may be ``None``)
    :returns: tensor with values summed along axis,
        or ``None`` if ``out`` is tensor
    """
    # Documentation stub: the body is intentionally empty.
    # NOTE: the name shadows the builtin ``sum``; it is part of the public
    # API and therefore kept.
    pass
def tensor_to_array(*args, **kwargs):
    """tensor_to_array(*args, **kwargs)
    Overloaded function.

    1. tensor_to_array(tensor: augpy._augpy.CudaTensor) -> array

    Copy a given tensor to a new numpy array.
    This initiates an asynchronous copy from device to host memory.

    2. tensor_to_array(tensor: augpy._augpy.CudaTensor, array: buffer) -> array

    Copy a given tensor to a numpy array created from the given buffer ``array``.
    This initiates an asynchronous copy from device to host memory.
    """
    # Documentation stub: the body is intentionally empty.
    pass
# Unsigned integer data types, listed by increasing bit width.
uint8 = DLDataType(bits=8, code=kDLUInt)
uint16 = DLDataType(bits=16, code=kDLUInt)
uint32 = DLDataType(bits=32, code=kDLUInt)
uint64 = DLDataType(bits=64, code=kDLUInt)
def warp_affine(src: CudaTensor, dst: CudaTensor, matrix: buffer, background: CudaTensor, supersampling: int) -> None:
    # Raw docstring: the :math: roles contain ``\times``, whose ``\t`` would
    # otherwise be interpreted as a tab escape.
    r"""warp_affine(src: augpy._augpy.CudaTensor, dst: augpy._augpy.CudaTensor, matrix: buffer, background: augpy._augpy.CudaTensor, supersampling: int) -> None

    Takes an image in channels-last format :math:`(H, W, C)`
    and affine warps it into a given output tensor in
    channels-first format :math:`(C, H, W)`.
    Any blank canvas is filled with a background color.
    The warp is performed with bi-linear interpolation and supersampling.

    Parameters:
        src: image tensor
        dst: target tensor
        matrix: :math:`2 \times 3` ``float`` transformation matrix,
            see :py:func:`make_affine_matrix` for details
        background: background color to fill empty canvas
        supersampling: supersampling factor, e.g., 3 means
            9 samples are taken in a :math:`3 \times 3` grid
    """
    # Documentation stub: the body is intentionally empty.
    pass
# Public API of the module. This must be named ``__all__``: a plain ``all``
# would shadow the builtin and overwrite the module's own ``all`` entry,
# which is exported below.
__all__ = [
    'CuRandError',
    'CudaDevice',
    'CudaDeviceProp',
    'CudaError',
    'CudaEvent',
    'CudaStream',
    'CudaTensor',
    'CutlassError',
    'kDLInt',
    'kDLFloat',
    'DLDataTypeCode',
    'DLDataType',
    'kDLUInt',
    'Decoder',
    'MemoryError',
    'NvJpegError',
    'RandomNumberGenerator',
    'WarpScaleMode',
    'WARP_SCALE_LONGEST',
    'WARP_SCALE_SHORTEST',
    'add',
    'all',
    'array_to_tensor',
    'box_blur_single',
    'cast',
    'copy',
    'default_stream',
    'disable_profiler',
    'div',
    'empty_like',
    'enable_profiler',
    'eq',
    'export_dltensor',
    'fill',
    'float16',
    'float32',
    'float64',
    'fma',
    'gaussian_blur',
    'gaussian_blur_single',
    'ge',
    'gemm',
    'get_current_device',
    'get_current_stream',
    'get_device_properties',
    'gt',
    'import_dltensor',
    'init',
    'int16',
    'int32',
    'int64',
    'int8',
    'le',
    'lighting',
    'lt',
    'make_affine_matrix',
    'meminfo',
    'mul',
    'nvtx_range_end',
    'nvtx_range_start',
    'rdiv',
    'release',
    'rsub',
    'sub',
    'sum',
    'tensor_to_array',
    'uint16',
    'uint32',
    'uint64',
    'uint8',
    'warp_affine',
]