Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add aten::_to_sparse_csc() and aten::_to_sparse_csr #1313

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
8 changes: 6 additions & 2 deletions cmake/Codegen.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ file(MAKE_DIRECTORY ${BUILD_TORCH_XPU_ATEN_GENERATED})
set(RegisterXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterXPU_0.cpp)
set(RegisterSparseXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterSparseXPU_0.cpp)
set(RegisterNestedTensorXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterNestedTensorXPU_0.cpp)
set(RegisterSparseCsrXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterSparseCsrXPU_0.cpp)
set(XPUFallback_PATH ${TORCH_XPU_OPS_ROOT}/src/ATen/native/xpu/XPUFallback.template)

if(WIN32)
Expand Down Expand Up @@ -49,6 +50,7 @@ endfunction(GEN_BACKEND)
set(RegisterXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterXPU_0.cpp)
set(RegisterSparseXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterSparseXPU_0.cpp)
set(RegisterNestedTensorXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterNestedTensorXPU_0.cpp)
set(RegisterSparseCsrXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterSparseCsrXPU_0.cpp)
set(XPUFallback_PATH ${TORCH_XPU_OPS_ROOT}/src/ATen/native/xpu/XPUFallback.template)
set(XPU_AOTI_INSTALL_DIR ${TORCH_ROOT}/torch/csrc/inductor/aoti_torch/generated/extend)
function(GEN_XPU file_yaml)
Expand Down Expand Up @@ -79,7 +81,7 @@ function(GEN_XPU file_yaml)
--install-dir ${BUILD_TORCH_XPU_ATEN_GENERATED}
--per-operator-headers
--static-dispatch-backend
--backend-whitelist XPU SparseXPU NestedTensorXPU
--backend-whitelist XPU SparseXPU NestedTensorXPU SparseCsrXPU
# --xpu: generate in-tree RegisterXPU_0.cpp for in-tree OPs
--xpu
# --update-aoti-c-shim: generate extend/c_shim_xpu.h
Expand All @@ -96,6 +98,7 @@ function(GEN_XPU file_yaml)
COMMAND "${PYTHON_EXECUTABLE}" ${TORCH_XPU_OPS_ROOT}/tools/codegen/remove_headers.py --register_xpu_path ${RegisterXPU_PATH}
COMMAND "${PYTHON_EXECUTABLE}" ${TORCH_XPU_OPS_ROOT}/tools/codegen/remove_headers.py --register_xpu_path ${RegisterSparseXPU_PATH}
COMMAND "${PYTHON_EXECUTABLE}" ${TORCH_XPU_OPS_ROOT}/tools/codegen/remove_headers.py --register_xpu_path ${RegisterNestedTensorXPU_PATH}
COMMAND "${PYTHON_EXECUTABLE}" ${TORCH_XPU_OPS_ROOT}/tools/codegen/remove_headers.py --register_xpu_path ${RegisterSparseCsrXPU_PATH}
${SIMPLE_TRACE}
WORKING_DIRECTORY ${TORCH_ROOT}
DEPENDS
Expand Down Expand Up @@ -126,6 +129,7 @@ GEN_XPU(
${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterXPU_0.cpp
${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterSparseXPU_0.cpp
${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterNestedTensorXPU_0.cpp
${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterSparseCsrXPU_0.cpp
${XPU_AOTI_INSTALL_DIR}/c_shim_xpu.h
${XPU_AOTI_INSTALL_DIR}/c_shim_xpu.cpp
)
Expand All @@ -137,7 +141,7 @@ GEN_XPU(
# $TORCH_XPU_OPS_INCLUDE_DIRS, so that "#include <ATen/ops/*.h>" works.
list(APPEND TORCH_XPU_OPS_INCLUDE_DIRS ${CMAKE_BINARY_DIR}/xpu)

list(APPEND xpu_generated_src ${RegisterXPU_PATH} ${RegisterSparseXPU_PATH} ${RegisterNestedTensorXPU_PATH})
list(APPEND xpu_generated_src ${RegisterXPU_PATH} ${RegisterSparseXPU_PATH} ${RegisterNestedTensorXPU_PATH} ${RegisterSparseCsrXPU_PATH})
list(APPEND xpu_generated_src ${XPU_AOTI_INSTALL_DIR}/c_shim_xpu.cpp)
add_custom_target(TORCH_XPU_GEN_TARGET DEPENDS ${xpu_generated_src})
set(ATen_XPU_GEN_SRCS ${xpu_generated_src})
34 changes: 34 additions & 0 deletions src/ATen/native/sparse/xpu/SparseCsrTensorMath.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
// Structured-kernel glue for sparse COO <-> CSR index conversion on XPU.
// NOTE: this is a translation unit (.cpp); the previous `#pragma once` was
// removed — include guards are meaningless in implementation files.
#include <xpu/ATen/ops/_convert_indices_from_coo_to_csr_native.h>
#include <xpu/ATen/ops/_convert_indices_from_csr_to_coo_native.h>
#include <ATen/native/sparse/xpu/sycl/SparseCsrTensorMathKernels.h>

namespace at::native {

// Structured implementation of aten::_convert_indices_from_coo_to_csr for the
// XPU backend. The structured-kernel framework has already validated and
// allocated everything; this simply forwards to the SYCL launcher.
//   input     - COO row-indices tensor (integral dtype; assumed sorted
//               ascending — the kernel relies on nondecreasing values)
//   size      - number of rows of the CSR matrix
//   out_int32 - when true, produce int32 crow indices; otherwise int64
//   result    - pre-allocated output; the kernel writes entries 0..size
TORCH_IMPL_FUNC(_convert_indices_from_coo_to_csr_structured_xpu)(
    const Tensor& input,
    const int64_t size,
    const bool out_int32,
    const Tensor& result) {
  xpu::convert_indices_from_coo_to_csr_structured_kernel(
      input, size, out_int32, result);
}

// Structured implementation of aten::_convert_indices_from_csr_to_coo for the
// XPU backend; forwards to the SYCL kernel launcher.
//   crow_indices - CSR compressed row pointers (length = nrows + 1, per the
//                  launcher's `crow_indices.numel() - 1` row count)
//   col_indices  - CSR column indices (copied into one row of the output)
//   out_int32    - when true the output indices are int32, otherwise int64
//   transpose    - when true, the output's (row, col) planes are swapped
//   result       - pre-allocated contiguous output indices tensor; the kernel
//                  selects planes 0 and 1 along dim 0
TORCH_IMPL_FUNC(_convert_indices_from_csr_to_coo_structured_xpu)(
const Tensor& crow_indices,
const Tensor& col_indices,
const bool out_int32,
const bool transpose,
const Tensor& result){
xpu::convert_indices_from_csr_to_coo_structured_kernel(
crow_indices,
col_indices,
out_int32,
transpose,
result);
};

} // namespace at::native
199 changes: 199 additions & 0 deletions src/ATen/native/sparse/xpu/sycl/SparseCsrTensorMathKernels.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS

Check failure on line 1 in src/ATen/native/sparse/xpu/sycl/SparseCsrTensorMathKernels.cpp

View workflow job for this annotation

GitHub Actions / preci-lint-check

NEWLINE Trailing newline

Trailing newline found. Run `lintrunner --take NEWLINE -a` to apply changes.
#include <ATen/core/Tensor.h>
#include <ATen/Dispatch.h>
#include <ATen/ExpandUtils.h>
#include <ATen/InitialTensorOptions.h>
#include <ATen/SparseCsrTensorImpl.h>
#include <ATen/SparseCsrTensorUtils.h>
#include <ATen/WrapDimUtilsMulti.h>
#include <ATen/native/BinaryOps.h>
#include <ATen/native/Resize.h>
#include <ATen/native/SparseTensorUtils.h>
#include <algorithm>
#include <ATen/AccumulateType.h>

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/_sparse_csr_tensor_unsafe_native.h>
#include <ATen/ops/_unique.h>
#include <ATen/ops/add_native.h>
#include <ATen/ops/resize_as_sparse_native.h>
#include <ATen/ops/tensor.h>
#include <ATen/ops/zeros.h>
#endif

#include <ATen/native/xpu/sycl/Loops.h>
#include <ATen/native/xpu/sycl/pstl/PSTLFunctions.h>
#include <ATen/native/sparse/xpu/sycl/SparseCsrTensorMathKernels.h>
#include <comm/SYCLContext.h>

namespace at::native::xpu{

// SYCL functor computing CSR crow pointers (data_out) from COO row indices
// (data_in). Work-item `linear_id` fills the output slots "owned" by input
// position linear_id:
//   * id 0            zero-fills data_out[0 .. data_in[0]]
//   * id k, 0<k<numel writes k into data_out[data_in[k-1]+1 .. data_in[k]]
//   * id numel        fills the tail data_out[data_in[numel-1]+1 .. size]
// NOTE(review): assumes data_in is nondecreasing (sorted COO rows) so the
// per-item ranges are disjoint — confirm with callers. The launch must cover
// numel + 1 work items or the tail branch never executes.
template <typename input_t, typename output_t>
struct convertIndicesFromCooToCsrXPUFunctor{
void operator()(sycl::nd_item<1> itemId) const {
auto linear_id = itemId.get_global_linear_id();
if (linear_id == 0) {
for (int64_t i = 0; i <= data_in[0]; i++)
data_out[i] = static_cast<output_t>(0);
} else if (linear_id < numel) {
for (int64_t i = data_in[linear_id - 1]; i < data_in[linear_id]; i++)
data_out[i + 1] = static_cast<output_t>(linear_id);
} else if (linear_id == numel) {
for (int64_t i = data_in[numel - 1] + 1; i < size + 1; i++)
data_out[i] = static_cast<output_t>(numel);
}
}
convertIndicesFromCooToCsrXPUFunctor(
int64_t numel_,
const input_t* data_in_,
output_t* data_out_,
const int64_t size_)
: numel(numel_), data_in(data_in_), data_out(data_out_), size(size_) {}

private:
// Number of COO entries (= number of input row indices).
int64_t numel;
// Sorted COO row indices (device pointer, read-only).
const input_t* data_in;
// CSR crow pointers output (device pointer); entries 0..size are written.
output_t* data_out;
// Number of matrix rows; data_out holds size + 1 entries.
const int64_t size;
};

// SYCL functor expanding CSR crow pointers into per-nonzero COO row indices.
// Work-item `linear_id` handles matrix row linear_id: every nonzero in
// [crow_indices_data_in[linear_id], crow_indices_data_in[linear_id + 1])
// gets that row id written into data_out. One work-item per row; ids beyond
// nrows are no-ops, so over-provisioned ranges are safe.
template <typename input_t, typename output_t>
struct convertIndicesFromCsrToCooXPUFunctor {
void operator()(sycl::nd_item<1> itemId) const {
int64_t linear_id = itemId.get_global_linear_id();
if (linear_id < nrows) {
for (int64_t i = crow_indices_data_in[linear_id];
i < crow_indices_data_in[linear_id + 1];
i++)
data_out[i] = static_cast<output_t>(linear_id);
}
}
convertIndicesFromCsrToCooXPUFunctor(
int64_t nrows_,
const input_t* crow_indices_data_in_,
output_t* data_out_)
: nrows(nrows_),
crow_indices_data_in(crow_indices_data_in_),
data_out(data_out_) {}

private:
// Number of matrix rows; crow_indices_data_in holds nrows + 1 entries.
int64_t nrows;
// CSR compressed row pointers (device pointer, read-only).
const input_t* crow_indices_data_in;
// Output COO row indices, one entry per nonzero (device pointer).
output_t* data_out;
};

// Launches the COO->CSR index-conversion kernel.
//   result - pre-allocated crow-pointer tensor (entries 0..size are written)
//   input  - COO row indices (assumed sorted ascending; see functor)
//   size   - number of matrix rows
template <typename input_t, typename output_t>
void launch_convert_indices_from_coo_to_csr_xpu_kernel(
    const Tensor& result,
    const Tensor& input,
    const int64_t size) {
  int64_t numel = input.numel();

  // No nonzeros: every crow pointer is 0.
  if (numel == 0) {
    result.zero_();
    return;
  }

  const input_t* data_in = input.const_data_ptr<input_t>();
  output_t* data_out = result.data_ptr<output_t>();

  int64_t wgroup_size = 64;
  // BUG FIX: the functor uses linear ids 0..numel *inclusive* — the
  // `linear_id == numel` branch fills the tail of the crow array — so the
  // global range must cover numel + 1 work items. The previous rounding,
  // (numel + wgroup_size - 1) / wgroup_size, covered only numel items and
  // silently skipped the tail whenever numel was a multiple of wgroup_size,
  // leaving the last crow entries uninitialized. (numel + wgroup_size) /
  // wgroup_size == ceil((numel + 1) / wgroup_size), matching the CUDA
  // implementation's (numel + THREADS) / THREADS grid computation.
  int64_t ngroups = (numel + wgroup_size) / wgroup_size;
  sycl::range<1> global_range(ngroups * wgroup_size);
  sycl::range<1> local_range(wgroup_size);

  auto functor = convertIndicesFromCooToCsrXPUFunctor<input_t, output_t>(
      numel, data_in, data_out, size);

  sycl_kernel_submit(global_range, local_range, getCurrentSYCLQueue(), functor);
}


// Launches the CSR->COO index-conversion kernel.
//   indices      - pre-allocated, contiguous output; plane 0 along dim 0
//                  receives expanded row ids, plane 1 a copy of col_indices
//                  (swapped when `transpose` is true)
//   crow_indices - CSR compressed row pointers (numel = nrows + 1)
//   col_indices  - CSR column indices
template <typename input_t, typename output_t>
void launch_convert_indices_from_csr_to_coo_xpu_kernel(
    const Tensor& indices,
    const Tensor& crow_indices,
    const Tensor& col_indices,
    const bool transpose = false) {
  int64_t nrows = crow_indices.numel() - 1;

  // Degenerate matrix with no rows: nothing to expand.
  if (nrows == 0) {
    indices.zero_();
    return;
  }

  auto crow_indices_ = crow_indices.expect_contiguous();
  // const_data_ptr for the read-only input, consistent with the COO->CSR
  // launcher (the previous data_ptr call was a non-const access for no reason).
  const input_t* crow_indices_data_in =
      crow_indices_->const_data_ptr<input_t>();
  TORCH_INTERNAL_ASSERT(indices.is_contiguous());
  // `transpose` swaps which output plane holds rows vs. columns.
  auto row0 = indices.select(0, transpose ? 1 : 0);
  auto row1 = indices.select(0, transpose ? 0 : 1);
  output_t* data_out = row0.data_ptr<output_t>();
  row1.copy_(*col_indices.expect_contiguous());

  // One work-item per row; the functor ignores ids >= nrows, so rounding the
  // global range up to a work-group multiple is safe.
  int64_t wgroup_size = 64;
  int64_t ngroups = (nrows + wgroup_size - 1) / wgroup_size;
  sycl::range<1> global_range(ngroups * wgroup_size);
  sycl::range<1> local_range(wgroup_size);

  auto functor = convertIndicesFromCsrToCooXPUFunctor<input_t, output_t>(
      nrows, crow_indices_data_in, data_out);

  sycl_kernel_submit(global_range, local_range, getCurrentSYCLQueue(), functor);
}

void convert_indices_from_coo_to_csr_structured_kernel(
const Tensor& input,
const int64_t size,
const bool out_int32,
const Tensor& result){

Check failure on line 156 in src/ATen/native/sparse/xpu/sycl/SparseCsrTensorMathKernels.cpp

View workflow job for this annotation

GitHub Actions / preci-lint-check

SPACES trailing spaces

This line has trailing spaces; please remove them.
if (out_int32){
AT_DISPATCH_INTEGRAL_TYPES(
input.scalar_type(), "convert_indices_from_coo_to_csr_xpu", [&] {
launch_convert_indices_from_coo_to_csr_xpu_kernel<scalar_t, int>(
result, input, size);
});
} else {
AT_DISPATCH_INTEGRAL_TYPES(
input.scalar_type(), "convert_indices_from_coo_to_csr_xpu", [&] {
launch_convert_indices_from_coo_to_csr_xpu_kernel<scalar_t, int64_t>(
result, input, size);
});
}
}

// Dispatch glue for the CSR->COO conversion: selects the output index width
// (int32 vs int64) from `out_int32` and the input integral type from
// `crow_indices.scalar_type()`, then launches the templated kernel.
void convert_indices_from_csr_to_coo_structured_kernel(
    const Tensor& crow_indices,
    const Tensor& col_indices,
    const bool out_int32,
    const bool transpose,
    const Tensor& result) {
  if (out_int32) {
    AT_DISPATCH_INTEGRAL_TYPES(
        crow_indices.scalar_type(),
        "convert_indices_from_csr_to_coo_xpu",
        [&] {
          launch_convert_indices_from_csr_to_coo_xpu_kernel<scalar_t, int>(
              result, crow_indices, col_indices, transpose);
        });
  } else {
    // BUG FIX: this branch's dispatch label was copy-pasted as
    // "convert_indices_from_coo_to_csr_xpu", mislabeling dispatch errors
    // from the CSR->COO path; both branches now use the correct name.
    AT_DISPATCH_INTEGRAL_TYPES(
        crow_indices.scalar_type(),
        "convert_indices_from_csr_to_coo_xpu",
        [&] {
          launch_convert_indices_from_csr_to_coo_xpu_kernel<scalar_t, int64_t>(
              result, crow_indices, col_indices, transpose);
        });
  }
}
} // namespace at::native::xpu



20 changes: 20 additions & 0 deletions src/ATen/native/sparse/xpu/sycl/SparseCsrTensorMathKernels.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#pragma once

#include <ATen/native/SparseTensorUtils.h>
#include <ATen/native/TensorIterator.h>

namespace at::native::xpu {

// Fills `result` with CSR crow pointers computed from the COO row indices in
// `input` (integral dtype; assumed sorted ascending). `size` is the number of
// matrix rows; entries 0..size of `result` are written. int32 output indices
// when `out_int32` is true, int64 otherwise.
TORCH_XPU_API void convert_indices_from_coo_to_csr_structured_kernel(
const Tensor& input,
const int64_t size,
const bool out_int32,
const Tensor& result);

// Expands CSR (`crow_indices`, `col_indices`) into COO indices written to
// `result` (contiguous; planes 0/1 along dim 0 hold row/col ids, swapped when
// `transpose` is true). int32 output indices when `out_int32` is true.
TORCH_XPU_API void convert_indices_from_csr_to_coo_structured_kernel(
const Tensor& crow_indices,
const Tensor& col_indices,
const bool out_int32,
const bool transpose,
const Tensor& result);
} // namespace at::native::xpu
1 change: 0 additions & 1 deletion src/ATen/native/xpu/XPUFallback.template
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) {
"lu_unpack.out",
"ormqr",
"_scaled_mm",
"_to_sparse_csr",
"triangular_solve.X",
"_validate_compressed_sparse_indices",
"vdot",
Expand Down
13 changes: 13 additions & 0 deletions test/xpu/skip_list_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,19 @@
"test_errors_sparse_mul_layout3_xpu",
"test_out_requires_grad_error_sparse_sampled_addmm_xpu_complex64",
"test_out_requires_grad_error_sparse_sampled_addmm_xpu_float32",
# Similar error as above for to_sparse_csr() operator
"test_sparse_csr_from_dense_xpu_bfloat16",
"test_sparse_csr_from_dense_xpu_bool",
"test_sparse_csr_from_dense_xpu_complex128",
"test_sparse_csr_from_dense_xpu_complex64",
"test_sparse_csr_from_dense_xpu_float16",
"test_sparse_csr_from_dense_xpu_float32",
"test_sparse_csr_from_dense_xpu_float64",
"test_sparse_csr_from_dense_xpu_int16",
"test_sparse_csr_from_dense_xpu_int32",
"test_sparse_csr_from_dense_xpu_int64",
"test_sparse_csr_from_dense_xpu_int8",
"test_sparse_csr_from_dense_xpu_uint8",
# OneDNN issues, https://github.com/intel/torch-xpu-ops/issues/253
# RuntimeError: Long is not supported in oneDNN!
# RuntimeError: could not create a primitive descriptor for a deconvolution forward propagation primitive
Expand Down
28 changes: 27 additions & 1 deletion test/xpu/test_sparse_xpu.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
# Owner(s): ["module: intel"]

Check warning on line 1 in test/xpu/test_sparse_xpu.py

View workflow job for this annotation

GitHub Actions / preci-lint-check

RUFF format

Run `lintrunner -a` to apply this patch.

Check warning on line 1 in test/xpu/test_sparse_xpu.py

View workflow job for this annotation

GitHub Actions / preci-lint-check

PYFMT format

Run `lintrunner -a` to apply this patch.
from torch.testing._internal.common_device_type import instantiate_device_type_tests
from torch.testing._internal.common_device_type import instantiate_device_type_tests, dtypes
from torch.testing._internal.common_dtype import all_types_and_complex_and
from torch.testing._internal.common_utils import run_tests

import torch

try:
from xpu_test_utils import XPUPatchForImport
except Exception as e:
Expand All @@ -10,6 +13,29 @@
with XPUPatchForImport(False):
from test_sparse import TestSparse

# @skipIfTorchDynamo()

Check warning on line 16 in test/xpu/test_sparse_xpu.py

View workflow job for this annotation

GitHub Actions / preci-lint-check

FLAKE8 E265

block comment should start with '# ' See https://www.flake8rules.com/rules/E265.html
@dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16))
def sparse_csr_from_dense(self, device, dtype):
    """Check Tensor.to_sparse_csr() against hand-computed CSR triples.

    Covers three shapes of sparsity: a mixed matrix, a matrix containing an
    empty row, and a fully dense matrix. Reformatted to satisfy the RUFF/PYFMT
    lint checks flagged on this file; behavior is unchanged.
    """
    # Mixed sparsity: nonzeros in rows 0 and 2, row 1 empty.
    dense = torch.tensor(
        [[4, 5, 0], [0, 0, 0], [1, 0, 0]], dtype=dtype, device=device
    )
    sparse = dense.to_sparse_csr()
    self.assertEqual(
        torch.tensor([0, 2, 2, 3], dtype=torch.int64), sparse.crow_indices()
    )
    self.assertEqual(
        torch.tensor([0, 1, 0], dtype=torch.int64), sparse.col_indices()
    )
    self.assertEqual(torch.tensor([4, 5, 1], dtype=dtype), sparse.values())

    # Leading empty row (crow_indices starts [0, 0, ...]).
    dense = torch.tensor(
        [[0, 0, 0], [0, 0, 1], [1, 0, 0]], dtype=dtype, device=device
    )
    sparse = dense.to_sparse_csr()
    self.assertEqual(
        torch.tensor([0, 0, 1, 2], dtype=torch.int64), sparse.crow_indices()
    )
    self.assertEqual(torch.tensor([2, 0], dtype=torch.int64), sparse.col_indices())
    self.assertEqual(torch.tensor([1, 1], dtype=dtype), sparse.values())

    # Fully dense matrix: every column index present in every row.
    dense = torch.tensor(
        [[2, 2, 2], [2, 2, 2], [2, 2, 2]], dtype=dtype, device=device
    )
    sparse = dense.to_sparse_csr()
    self.assertEqual(
        torch.tensor([0, 3, 6, 9], dtype=torch.int64), sparse.crow_indices()
    )
    self.assertEqual(
        torch.tensor([0, 1, 2] * 3, dtype=torch.int64), sparse.col_indices()
    )
    self.assertEqual(torch.tensor([2] * 9, dtype=dtype), sparse.values())

Check warning on line 36 in test/xpu/test_sparse_xpu.py

View workflow job for this annotation

GitHub Actions / preci-lint-check

RUFF W293

Blank line contains whitespace. See https://beta.ruff.rs/docs/rules/. To disable, use ` # noqa: W293`

Check failure on line 36 in test/xpu/test_sparse_xpu.py

View workflow job for this annotation

GitHub Actions / preci-lint-check

SPACES trailing spaces

This line has trailing spaces; please remove them.

Check warning on line 36 in test/xpu/test_sparse_xpu.py

View workflow job for this annotation

GitHub Actions / preci-lint-check

FLAKE8 W293

blank line contains whitespace See https://www.flake8rules.com/rules/W293.html
TestSparse.test_sparse_csr_from_dense = sparse_csr_from_dense

instantiate_device_type_tests(TestSparse, globals(), only_for="xpu", allow_xpu=True)

if __name__ == "__main__":
Expand Down
Loading
Loading