# Save compilation stages - some of the stages identified here are specific to NVIDIA devices:
with
open(
'triton_IR.txt'
,
'w'
)
as
f:
print(triton_kernel.asm[
'ttir'
], file=f)
with
open(
'triton_TTGIR.txt'
,
'w'
)
as
f:
print(triton_kernel.asm[
'ttgir'
], file=f)
with
open(
'triton_LLVMIR.txt'
,
'w'
)
as
f:
print(triton_kernel.asm[
'llir'
], file=f)
with
open(
'triton_PTX.ptx'
,
'w'
)
as
f:
print(triton_kernel.asm[
'ptx'
], file=f)
with
open(
'triton_cubin.txt'
,
'w'
)
as
f:
print(triton_kernel.asm[
'cubin'
], file=f)
return
output
# Fix the RNG seed so the two random operands are reproducible across runs.
torch.manual_seed(0)
size = 98432

# Draw the operands in a fixed order (x first, then y) on the CUDA device.
x = torch.rand(size, device='cuda')
y = torch.rand(size, device='cuda')

# Reference result from PyTorch next to the result of add() (defined earlier
# in this file).
output_torch = x + y
output_triton = add(x, y)
print(output_torch)
print(output_triton)

# Report the largest element-wise absolute deviation between the two results.
print(
    'The maximum difference between torch and triton is '
    f'{(output_torch - output_triton).abs().max()}'
)