Skip to content
Open
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions loopy/expression.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,9 +162,8 @@ def map_constant(self, expr: object) -> bool:

def map_variable(self, expr: p.Variable) -> bool:
if expr.name == self.vec_iname:
# Technically, this is doable. But we're not going there.
raise UnvectorizableError()

# Technically, this is doable.
Comment thread
nkoskelo marked this conversation as resolved.
Outdated
return True
# A single variable is always a scalar.
Comment thread
nkoskelo marked this conversation as resolved.
return False

Expand Down
82 changes: 81 additions & 1 deletion loopy/target/opencl.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@

from loopy.codegen import CodeGenerationState
from loopy.codegen.result import CodeGenerationResult
from loopy.kernel import LoopKernel


# {{{ dtype registry wrappers
Expand Down Expand Up @@ -456,7 +457,8 @@ def get_opencl_callables():

# {{{ symbol mangler

def opencl_symbol_mangler(kernel, name):
def opencl_symbol_mangler(kernel: LoopKernel,
name: str) -> tuple[NumpyType, str] | None:
# FIXME: should be more picky about exact names
if name.startswith("FLT_"):
return NumpyType(np.dtype(np.float32)), name
Expand Down Expand Up @@ -545,6 +547,21 @@ def wrap_in_typecast(self, actual_type, needed_dtype, s):
from pymbolic.primitives import Comparison
return Comparison(s, "!=", 0)

if needed_dtype == actual_type:
return s

registry = self.codegen_state.ast_builder.target.get_dtype_registry()
if self.codegen_state.target.is_vector_dtype(needed_dtype):
# OpenCL does not let you do explicit vector type casts between vector
# types. Instead you need to call their function which is of the form
# <desttype> convert_<desttype><n>(src) where n
# is the number of elements in the vector which is the same as in src.
# https://registry.khronos.org/OpenCL/specs/3.0-unified/html/OpenCL_C.html#explicit-casts
if self.codegen_state.target.is_vector_dtype(actual_type) or \
actual_type.dtype.kind == "b":
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is this bool handling doing here?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The type `actual_type` is computed from the expression before the vector literal is inserted. I think a better solution would be to update the type inference system to use the vectorized version of the expression instead.

cast = var("convert_%s" % registry.dtype_to_ctype(needed_dtype))
return cast(s)

return super().wrap_in_typecast(actual_type, needed_dtype, s)

def map_group_hw_index(self, expr, type_context):
Expand All @@ -553,6 +570,69 @@ def map_group_hw_index(self, expr, type_context):
def map_local_hw_index(self, expr, type_context):
    """Map a local hardware index to a call of ``lid`` on its axis.

    :arg expr: the local hardware index expression; only ``expr.axis``
        is read.
    :arg type_context: unused here.
    :returns: the result of calling ``var("lid")`` with ``expr.axis``.
    """
    local_id = var("lid")
    return local_id(expr.axis)

def map_variable(self, expr, type_context):
    """Map a variable, turning the vectorization iname into a vector literal.

    When vectorization is active and *expr* names the iname being
    vectorized, the variable is replaced by a literal of the form
    ``((<vector type>) (0,1,...,length-1))`` whose element type is the
    kernel's index dtype. Every other variable is handled by the parent
    class.

    :arg expr: the variable expression; ``expr.name`` is compared against
        the vectorization iname.
    :arg type_context: forwarded unchanged to the parent implementation.
    """
    vec_info = self.codegen_state.vectorization_info

    if vec_info and vec_info.iname == expr.name:
        # This needs to be converted into a vector literal.
        from loopy.symbolic import Literal

        target = self.codegen_state.target
        vector_length = vec_info.length
        vector_type = target.vector_dtype(
            self.codegen_state.kernel.index_dtype, vector_length)
        typecast = target.dtype_to_typename(vector_type)

        # One entry per vector lane: 0, 1, ..., vector_length - 1.
        lanes = ",".join(str(lane) for lane in range(vector_length))
        return Literal(f"(({typecast}) ({lanes}))")

    return super().map_variable(expr, type_context)

def map_if(self, expr, type_context):
    """Map a conditional expression, typing its condition for OpenCL.

    The ``then``/``else_`` branches are mapped with the inferred type of
    *expr*. The condition is mapped as a boolean unless vectorization is
    active and the expression is vectorizable, in which case it is mapped
    as an integer vector type sized to match the result.

    :arg expr: the conditional expression; ``expr.condition``,
        ``expr.then`` and ``expr.else_`` are read.
    :arg type_context: forwarded unchanged to the recursive calls.
    :returns: a new expression of the same type as *expr*, built from the
        recursively mapped condition and branches.
    """
    from loopy.types import to_loopy_type

    result_type = self.infer_type(expr)
    # Default: a scalar conditional only needs a boolean condition.
    conditional_needed_loopy_type = to_loopy_type(np.bool_)

    vec_info = self.codegen_state.vectorization_info
    if vec_info:
        from loopy.codegen import UnvectorizableError
        from loopy.expression import VectorizabilityChecker

        checker = VectorizabilityChecker(
            self.codegen_state.kernel, vec_info.iname, vec_info.length)

        # The checker has succeeded at least once before, but not
        # necessarily for this particular expression, so it is re-run
        # here. If it rejects the expression, the expression is not a
        # vector and is handled like the scalar case. Only the check
        # itself is guarded, so that an error raised by the type handling
        # below is not silently swallowed.
        try:
            is_vector = checker(expr)
        except UnvectorizableError:
            is_vector = False

        if is_vector:
            # We could have a vector literal here which may need to be
            # converted to an appropriate size. The OpenCL specification
            # states that for (c ? a : b) a, b, and c must have the same
            # number of elements and bits and that c must be an integral
            # type.
            # https://registry.khronos.org/OpenCL/specs/3.0-unified/html/OpenCL_C.html#table-builtin-relational
            length = vec_info.length
            index_type = to_loopy_type(self.codegen_state.kernel.index_dtype)
            # Signed integer types keyed by item size in bytes.
            types = {8: to_loopy_type(np.int64), 4: to_loopy_type(np.int32),
                     2: to_loopy_type(np.int16), 1: to_loopy_type(np.int8)}

            if (index_type.itemsize != result_type.itemsize
                    and result_type.itemsize in types):
                # Need to convert index type into result type size.
                # Item size is measured in bytes.
                index_type = types[result_type.itemsize]
            elif (index_type.itemsize * length != result_type.itemsize
                    and (result_type.itemsize // length) in types):
                # NOTE(review): this branch is also reached when the index
                # and result item sizes already match (the first test is
                # then false) -- confirm that is intended.
                index_type = types[result_type.itemsize // length]

            vector_type = self.codegen_state.target.vector_dtype(
                index_type, length)
            conditional_needed_loopy_type = to_loopy_type(vector_type)

    return type(expr)(
        self.rec(expr.condition, type_context,
                 conditional_needed_loopy_type),
        self.rec(expr.then, type_context, result_type),
        self.rec(expr.else_, type_context, result_type),
    )
# }}}


Expand Down
30 changes: 30 additions & 0 deletions test/test_target.py
Original file line number Diff line number Diff line change
Expand Up @@ -875,6 +875,36 @@ def test_float3():
assert "float3" in device_code


def test_cl_vectorize_index_variable(ctx_factory):
    """Check a vectorized kernel whose conditional depends on the iname.

    The kernel computes ``a[i]*3`` where ``i < 32`` and ``sin(a[i])``
    elsewhere, with the inner split iname tagged ``vec``. The device result
    is compared against a reference computed element by element on the
    host.
    """
    knl = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        "\nb[i] = a[i]*3 if i < 32 else sin(a[i])\n")

    # Split arrays and iname by 4 and vectorize over the inner part.
    knl = lp.split_array_axis(knl, "a,b", 0, 4)
    knl = lp.split_iname(knl, "i", 4)
    knl = lp.tag_inames(knl, {"i_inner": "vec"})
    knl = lp.tag_array_axes(knl, "a,b", "c,vec")
    knl = lp.set_options(knl, write_code=True)
    knl = lp.assume(knl, "n % 4 = 0 and n>0")

    a = np.random.default_rng(seed=12).normal(size=(16, 4))
    queue = cl.CommandQueue(ctx_factory())
    knl = lp.add_and_infer_dtypes(knl, {"a": np.float64, "n": np.int64})
    _evt, (result,) = knl(queue, a=a, n=a.size)

    # Host reference: (row, col) maps to flat index row*4 + col.
    result_ref = np.zeros(a.shape, dtype=np.float64)
    for (row, col), value in np.ndenumerate(a):
        flat_index = row*4 + col
        if flat_index < 32:
            result_ref[row, col] = value * 3
        else:
            result_ref[row, col] = np.sin(value)

    assert np.allclose(result, result_ref)


if __name__ == "__main__":
import sys
if len(sys.argv) > 1:
Expand Down