-
Notifications
You must be signed in to change notification settings - Fork 645
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
HSA_STATUS_ERROR_MEMORY_APERTURE_VIOLATION: The agent attempted to access memory beyond the largest legal address. code: 0x29 #19564
Comments
I am using f1e1866, and the same issue is present in 70B as well. |
This error is no longer present in 8B with a43d893, but I am still seeing the issue with the 70B prefill sharded model with tp8. Here is a simple IR to reproduce the issue:
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
module @module {
// Model parameters resolved by name from the "model" parameter scope.
// Both globals carry a stream affinity to @__device_0.
// 128256 x 8192 f16 weight; used below as the table operand of torch.aten.embedding.
util.global private @__auto.token_embd.weight {stream.affinity = #hal.device.promise<@__device_0>} = #stream.parameter.named<"model"::"token_embd.weight"> : tensor<128256x8192xf16>
// 8192 x 1024 f16 weight (shard 0 of blk.0 attn_output); used below as the right-hand operand of torch.aten.mm.
util.global private @__auto.blk.0.attn_output.weight.shard.0 {stream.affinity = #hal.device.promise<@__device_0>} = #stream.parameter.named<"model"::"blk.0.attn_output.weight.shard.0"> : tensor<8192x1024xf16>
// Reduced repro entry point: embedding lookup -> flatten -> matmul -> two reshapes.
//
// Arguments:
//   %arg0, %arg2      : [4,?] si64, affinity @__device_0.
//   %arg1             : [4] si64, affinity @__device_0.
//   %arg3 .. %arg10   : [?,655360] f16, one per device @__device_0 .. @__device_7
//                       (presumably the per-device cache shards -- TODO confirm).
// Only %arg0 feeds the computation; %arg2 is referenced solely by a shape binding,
// and %arg1 / %arg3 .. %arg10 are not referenced anywhere in this body.
//
// Returns a [4,?,8,128] f16 tensor.
func.func @prefill_bs4(%arg0: !torch.vtensor<[4,?],si64> {iree.abi.affinity = #hal.device.promise<@__device_0>}, %arg1: !torch.vtensor<[4],si64> {iree.abi.affinity = #hal.device.promise<@__device_0>}, %arg2: !torch.vtensor<[4,?],si64> {iree.abi.affinity = #hal.device.promise<@__device_0>}, %arg3: !torch.tensor<[?,655360],f16> {iree.abi.affinity = #hal.device.promise<@__device_0>}, %arg4: !torch.tensor<[?,655360],f16> {iree.abi.affinity = #hal.device.promise<@__device_1>}, %arg5: !torch.tensor<[?,655360],f16> {iree.abi.affinity = #hal.device.promise<@__device_2>}, %arg6: !torch.tensor<[?,655360],f16> {iree.abi.affinity = #hal.device.promise<@__device_3>}, %arg7: !torch.tensor<[?,655360],f16> {iree.abi.affinity = #hal.device.promise<@__device_4>}, %arg8: !torch.tensor<[?,655360],f16> {iree.abi.affinity = #hal.device.promise<@__device_5>}, %arg9: !torch.tensor<[?,655360],f16> {iree.abi.affinity = #hal.device.promise<@__device_6>}, %arg10: !torch.tensor<[?,655360],f16> {iree.abi.affinity = #hal.device.promise<@__device_7>}) -> !torch.vtensor<[4,?,8,128],f16> attributes {torch.assume_strict_symbolic_shapes} {
// Load both parameter globals and bring them into the torch value-tensor domain.
%__auto.token_embd.weight = util.global.load @__auto.token_embd.weight : tensor<128256x8192xf16>
%0 = torch_c.from_builtin_tensor %__auto.token_embd.weight : tensor<128256x8192xf16> -> !torch.vtensor<[128256,8192],f16>
%__auto.blk.0.attn_output.weight.shard.0 = util.global.load @__auto.blk.0.attn_output.weight.shard.0 : tensor<8192x1024xf16>
%40 = torch_c.from_builtin_tensor %__auto.blk.0.attn_output.weight.shard.0 : tensor<8192x1024xf16> -> !torch.vtensor<[8192,1024],f16>
// One symbolic size s1 in [2, 4095] drives both dynamic dims:
// %arg0 is bound to (4, s * 32) and %arg2 to (4, s).
%5793 = torch.symbolic_int "s1" {min_val = 2, max_val = 4095} : !torch.int
torch.bind_symbolic_shape %arg0, [%5793], affine_map<()[s0] -> (4, s0 * 32)> : !torch.vtensor<[4,?],si64>
torch.bind_symbolic_shape %arg2, [%5793], affine_map<()[s0] -> (4, s0)> : !torch.vtensor<[4,?],si64>
// Round-trip %arg0 through a builtin tensor so it can be explicitly transferred
// to @__device_0; %dim carries the dynamic size of dimension 1 for the transfer.
%5797 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[4,?],si64> -> tensor<4x?xi64>
%c1 = arith.constant 1 : index
%dim = tensor.dim %5797, %c1 : tensor<4x?xi64>
%5798 = flow.tensor.transfer %5797 : tensor<4x?xi64>{%dim} to #hal.device.promise<@__device_0>
%5799 = torch_c.from_builtin_tensor %5798 : tensor<4x?xi64> -> !torch.vtensor<[4,?],si64>
// Embedding lookup of the transferred indices in the token_embd table -> [4,?,8192].
%int-1 = torch.constant.int -1
%false = torch.constant.bool false
%false_30 = torch.constant.bool false
%5845 = torch.aten.embedding %0, %5799, %int-1, %false, %false_30 : !torch.vtensor<[128256,8192],f16>, !torch.vtensor<[4,?],si64>, !torch.int, !torch.bool, !torch.bool -> !torch.vtensor<[4,?,8192],f16>
// Dynamic size of dim 1. Note this is read from the original %arg0, not from the
// transferred copy %5799.
%int1_134 = torch.constant.int 1
%5949 = torch.aten.size.int %arg0, %int1_134 : !torch.vtensor<[4,?],si64>, !torch.int -> !torch.int
// Collapse the two leading dims: [4,?,8192] -> [4 * dim1, 8192].
%int4 = torch.constant.int 4
%5950 = torch.aten.mul.int %int4, %5949 : !torch.int, !torch.int -> !torch.int
%int8192 = torch.constant.int 8192
%5951 = torch.prim.ListConstruct %5950, %int8192 : (!torch.int, !torch.int) -> !torch.list<int>
%5952 = torch.aten.view %5845, %5951 : !torch.vtensor<[4,?,8192],f16>, !torch.list<int> -> !torch.vtensor<[?,8192],f16>
// Matmul against the [8192,1024] weight shard -> [?,1024].
%5953 = torch.aten.mm %5952, %40 : !torch.vtensor<[?,8192],f16>, !torch.vtensor<[8192,1024],f16> -> !torch.vtensor<[?,1024],f16>
// Restore the leading dims: [?,1024] -> [4, dim1, 1024].
%int4_135 = torch.constant.int 4
%int1024 = torch.constant.int 1024
%5954 = torch.prim.ListConstruct %int4_135, %5949, %int1024 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%5955 = torch.aten.view %5953, %5954 : !torch.vtensor<[?,1024],f16>, !torch.list<int> -> !torch.vtensor<[4,?,1024],f16>
// Split the trailing 1024 into 8 x 128: [4, dim1, 1024] -> [4, dim1, 8, 128].
%int4_236 = torch.constant.int 4
%int8 = torch.constant.int 8
%int128_237 = torch.constant.int 128
%6103 = torch.prim.ListConstruct %int4_236, %5949, %int8, %int128_237 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%6104 = torch.aten.view %5955, %6103 : !torch.vtensor<[4,?,1024],f16>, !torch.list<int> -> !torch.vtensor<[4,?,8,128],f16>
return %6104 : !torch.vtensor<[4,?,8,128],f16>
}
}
Commands:
|
It looks like the repro uses the inputs from decode when running prefill. I think this is the cause of the issue. I ran the repro with a modified command and didn't see an issue:
iree-benchmark-module \
--hip_use_streams=true \
--module=70b_prefill_sharded.vmfb \
--parameters=model=/data/llama3.1/weights/70b/fp16/tp8/llama3.1_70b_fp16_tp8_parameters.irpa \
--parameters=model=/data/llama3.1/weights/70b/fp16/tp8/llama3.1_70b_fp16_tp8_parameters.rank0.irpa \
--parameters=model=/data/llama3.1/weights/70b/fp16/tp8/llama3.1_70b_fp16_tp8_parameters.rank1.irpa \
--parameters=model=/data/llama3.1/weights/70b/fp16/tp8/llama3.1_70b_fp16_tp8_parameters.rank2.irpa \
--parameters=model=/data/llama3.1/weights/70b/fp16/tp8/llama3.1_70b_fp16_tp8_parameters.rank3.irpa \
--parameters=model=/data/llama3.1/weights/70b/fp16/tp8/llama3.1_70b_fp16_tp8_parameters.rank4.irpa \
--parameters=model=/data/llama3.1/weights/70b/fp16/tp8/llama3.1_70b_fp16_tp8_parameters.rank5.irpa \
--parameters=model=/data/llama3.1/weights/70b/fp16/tp8/llama3.1_70b_fp16_tp8_parameters.rank6.irpa \
--parameters=model=/data/llama3.1/weights/70b/fp16/tp8/llama3.1_70b_fp16_tp8_parameters.rank7.irpa \
--device=hip://0 \
--device=hip://1 \
--device=hip://2 \
--device=hip://3 \
--device=hip://4 \
--device=hip://5 \
--device=hip://6 \
--device=hip://7 \
--function=prefill_bs4 \
--input=@/data/llama3.1/weights/70b/prefill_args_bs4_128_stride_32_tp8/tokens.npy \
--input=@/data/llama3.1/weights/70b/prefill_args_bs4_128_stride_32_tp8//seq_lens.npy \
--input=@/data/llama3.1/weights/70b/prefill_args_bs4_128_stride_32_tp8//seq_block_ids.npy \
--input=@/data/llama3.1/weights/70b/prefill_args_bs4_128_stride_32_tp8//cs_f16_shard_0.npy \
--input=@/data/llama3.1/weights/70b/prefill_args_bs4_128_stride_32_tp8//cs_f16_shard_1.npy \
--input=@/data/llama3.1/weights/70b/prefill_args_bs4_128_stride_32_tp8//cs_f16_shard_2.npy \
--input=@/data/llama3.1/weights/70b/prefill_args_bs4_128_stride_32_tp8//cs_f16_shard_3.npy \
--input=@/data/llama3.1/weights/70b/prefill_args_bs4_128_stride_32_tp8//cs_f16_shard_4.npy \
--input=@/data/llama3.1/weights/70b/prefill_args_bs4_128_stride_32_tp8//cs_f16_shard_5.npy \
--input=@/data/llama3.1/weights/70b/prefill_args_bs4_128_stride_32_tp8//cs_f16_shard_6.npy \
--input=@/data/llama3.1/weights/70b/prefill_args_bs4_128_stride_32_tp8//cs_f16_shard_7.npy |
Edit: both prefill and decode are hitting an assert in IREE's runtime when running with the modified command. I think this may be related to other llama issues (#19573).
Update: tested with #19583 and it ran successfully. |
Fixed with #19583 |
What happened?
For Llama 8B-FP16 (prefill, sharded), I am getting the following error at runtime:
Steps to reproduce your issue
What component(s) does this issue relate to?
Runtime
Version information
No response
Additional context
No response
The text was updated successfully, but these errors were encountered: