// Origin: forked from NUDT-compiler/nudt-compiler-cpp (listing: 708 lines, 13 KiB).
// Syntax: AArch64, GNU assembler.
// ---------------------------------------------------------------------
// Global data.
//
// Objects with a non-zero initial value stay in .data.  Everything whose
// initial value is all zeroes lives in .bss so that the two 16 MiB
// buffers occupy no space in the object file.
// ---------------------------------------------------------------------
        .data

        .globl  N
        .p2align 2
N:
        .word   2048

        .globl  KSIZE
        .p2align 2
KSIZE:
        .word   5

        .bss

        // Generator state; seeded by main, advanced by get_random.
        .globl  state
        .p2align 2
state:
        .zero   4

        // Number of convolution passes; written by main, read by conv2d.
        .globl  repeat_factor
        .p2align 2
repeat_factor:
        .zero   4

        // Matrix edge length actually used at run time; written by main.
        .globl  N_eff
        .p2align 2
N_eff:
        .zero   4

        // 16777216 bytes = 4194304 32-bit words.
        .globl  In
        .p2align 2
In:
        .zero   16777216

        // 16777216 bytes = 4194304 32-bit words.
        .globl  Out
        .p2align 2
Out:
        .zero   16777216

        // 100 bytes = 25 32-bit words (init_kernel writes exactly 25).
        .globl  K
        .p2align 2
K:
        .zero   100

// ---------------------------------------------------------------------
// int get_random(void)
//
// Advances the global `state` and returns the new value:
//     steps = state % 2048            (C remainder, truncating)
//     repeat `steps` times (not at all when steps <= 0):
//         state = (state + 128) % 65535
//     state = state % 65535
//     return state
//
// Out:      w0 = new state (also stored to `state`)
// Clobbers: x9-x15, flags.  Leaf routine, no stack use.
// `state` is kept in a register during the loop and written back once.
// ---------------------------------------------------------------------
        .text
        .globl  get_random
        .p2align 2
get_random:
        adrp    x9, state
        add     x9, x9, :lo12:state     // x9 = &state
        ldr     w10, [x9]               // w10 = state

        mov     w11, #2048
        sdiv    w12, w10, w11
        msub    w12, w12, w11, w10      // w12 = steps = state % 2048

        mov     w13, #65535             // modulus
        mov     w14, #0                 // w14 = iterations done
        b       .Lget_random_check

.Lget_random_step:
        add     w10, w10, #128
        sdiv    w15, w10, w13
        msub    w10, w15, w13, w10      // state = (state + 128) % 65535
        add     w14, w14, #1
.Lget_random_check:
        cmp     w14, w12
        b.lt    .Lget_random_step

        sdiv    w15, w10, w13
        msub    w0, w15, w13, w10       // final reduction modulo 65535
        str     w0, [x9]                // publish the new state
        ret

// ---------------------------------------------------------------------
// int idx(int a, int b, int c)
//
// Returns a * c + b using 32-bit wrap-around arithmetic.  Callers in this
// file pass (row, column, row length) to flatten a 2-D position.
//
// In:       w0 = a, w1 = b, w2 = c
// Out:      w0 = a * c + b
// Clobbers: nothing besides w0.  Leaf routine, no stack use.
// ---------------------------------------------------------------------
        .text
        .globl  idx
        .p2align 2
idx:
        madd    w0, w0, w2, w1          // w0 = w1 + w0 * w2
        ret

// ---------------------------------------------------------------------
// void init_matrix(int *m)
//
// Fills the n x n matrix at `m` (n = N_eff, row-major, 32-bit elements):
//     rows  0 .. n/2-1 : every element = get_random() % 65535
//     rows  n/2 .. n-1 : every element = -1
// n/2 is a truncating signed division.  Nothing is written when n <= 0.
//
// In:       x0 = m
// Calls:    get_random
// Register roles (all callee-saved, so they survive the call):
//     x19 = m, w20 = n, w21 = n / 2, w22 = row, w23 = column
// N_eff is read once on entry; this assumes the matrix does not overlap
// N_eff itself.
// ---------------------------------------------------------------------
        .text
        .globl  init_matrix
        .p2align 2
init_matrix:
        stp     x29, x30, [sp, #-64]!
        mov     x29, sp
        stp     x19, x20, [sp, #16]
        stp     x21, x22, [sp, #32]
        str     x23, [sp, #48]

        mov     x19, x0
        adrp    x8, N_eff
        ldr     w20, [x8, #:lo12:N_eff]
        add     w21, w20, w20, lsr #31  // add 1 first when n is negative ...
        asr     w21, w21, #1            // ... so the shift truncates toward 0
        mov     w22, #0
        b       .Linit_matrix_row_check

.Linit_matrix_row:
        mov     w23, #0
        cmp     w22, w21
        b.ge    .Linit_matrix_fill_check    // lower half: constant fill
        b       .Linit_matrix_rand_check    // upper half: random fill

.Linit_matrix_rand:
        bl      get_random
        mov     w9, #65535
        sdiv    w10, w0, w9
        msub    w10, w10, w9, w0        // w10 = get_random() % 65535
        madd    w11, w22, w20, w23      // same value as idx(row, column, n)
        str     w10, [x19, w11, sxtw #2]
        add     w23, w23, #1
.Linit_matrix_rand_check:
        cmp     w23, w20
        b.lt    .Linit_matrix_rand
        b       .Linit_matrix_next_row

.Linit_matrix_fill:
        madd    w11, w22, w20, w23      // same value as idx(row, column, n)
        mov     w10, #-1
        str     w10, [x19, w11, sxtw #2]
        add     w23, w23, #1
.Linit_matrix_fill_check:
        cmp     w23, w20
        b.lt    .Linit_matrix_fill

.Linit_matrix_next_row:
        add     w22, w22, #1
.Linit_matrix_row_check:
        cmp     w22, w20
        b.lt    .Linit_matrix_row

        ldr     x23, [sp, #48]
        ldp     x21, x22, [sp, #32]
        ldp     x19, x20, [sp, #16]
        ldp     x29, x30, [sp], #64
        ret

// ---------------------------------------------------------------------
// void init_kernel(int *k)
//
// Writes 25 consecutive 32-bit words: k[i] = (i % 3) - 1 for i = 0 .. 24,
// i.e. the repeating pattern -1, 0, 1.
//
// In:       x0 = k (left unchanged)
// Clobbers: x9-x11, flags.  Leaf routine, no stack use.
// ---------------------------------------------------------------------
        .text
        .globl  init_kernel
        .p2align 2
init_kernel:
        mov     w9, #0                  // w9 = i
        mov     w10, #3
.Linit_kernel_next:
        sdiv    w11, w9, w10
        msub    w11, w11, w10, w9       // w11 = i % 3
        sub     w11, w11, #1
        str     w11, [x0, w9, sxtw #2]  // k[i] = i % 3 - 1
        add     w9, w9, #1
        cmp     w9, #25
        b.lt    .Linit_kernel_next
        ret

// ---------------------------------------------------------------------
// void conv2d(const int *in, int *out, const int *kernel)
//
// Runs `repeat_factor` identical passes.  Each pass computes, for every
// (row, col) of the n x n output (n = N_eff, row-major):
//     sum = 0
//     for ki in 0..4, kj in 0..4:
//         r = row + ki - 2;  c = col + kj - 2
//         if 0 <= r < n and 0 <= c < n:
//             sum += in[r * n + c] * kernel[ki * 5 + kj]
//     out[row * n + col] = sum
// Taps that fall outside the matrix contribute nothing.
//
// In:       x0 = in, x1 = out, x2 = kernel
// Register roles:
//     w3 = repeat_factor   w4 = pass      w5 = n
//     w6 = row             w7 = col       w8 = sum
//     w9 = ki              w10 = kj       w11 = r     w12 = c
//     w13-w15 = scratch
// Clobbers: x3-x15, flags.  Leaf routine, no stack use.
// repeat_factor and N_eff are read once on entry; this assumes `out`
// does not overlap either of them.
// ---------------------------------------------------------------------
        .text
        .globl  conv2d
        .p2align 2
conv2d:
        adrp    x8, repeat_factor
        ldr     w3, [x8, #:lo12:repeat_factor]
        adrp    x8, N_eff
        ldr     w5, [x8, #:lo12:N_eff]
        mov     w4, #0
        b       .Lconv2d_pass_check

.Lconv2d_pass:
        mov     w6, #0
        b       .Lconv2d_row_check

.Lconv2d_row:
        mov     w7, #0
        b       .Lconv2d_col_check

.Lconv2d_col:
        mov     w8, #0                  // sum = 0
        mov     w9, #0                  // ki = 0
.Lconv2d_krow:
        add     w11, w6, w9
        sub     w11, w11, #2            // r = row + ki - 2
        mov     w10, #0                 // kj = 0
.Lconv2d_kcol:
        add     w12, w7, w10
        sub     w12, w12, #2            // c = col + kj - 2

        // Skip the tap unless 0 <= r < n and 0 <= c < n.
        tbnz    w11, #31, .Lconv2d_tap_done
        cmp     w11, w5
        b.ge    .Lconv2d_tap_done
        tbnz    w12, #31, .Lconv2d_tap_done
        cmp     w12, w5
        b.ge    .Lconv2d_tap_done

        madd    w13, w11, w5, w12       // r * n + c
        ldr     w14, [x0, w13, sxtw #2]
        add     w13, w9, w9, lsl #2     // ki * 5
        add     w13, w13, w10           // ki * 5 + kj
        ldr     w15, [x2, w13, sxtw #2]
        madd    w8, w14, w15, w8        // sum += in[...] * kernel[...]

.Lconv2d_tap_done:
        add     w10, w10, #1
        cmp     w10, #5
        b.lt    .Lconv2d_kcol

        add     w9, w9, #1
        cmp     w9, #5
        b.lt    .Lconv2d_krow

        madd    w13, w6, w5, w7         // row * n + col
        str     w8, [x1, w13, sxtw #2]
        add     w7, w7, #1
.Lconv2d_col_check:
        cmp     w7, w5
        b.lt    .Lconv2d_col

        add     w6, w6, #1
.Lconv2d_row_check:
        cmp     w6, w5
        b.lt    .Lconv2d_row

        add     w4, w4, #1
.Lconv2d_pass_check:
        cmp     w4, w3
        b.lt    .Lconv2d_pass
        ret

// ---------------------------------------------------------------------
// void nonlinear(int *m)
//
// Rewrites the first N_eff * N_eff words of `m` in place:
//     m[i] = (m[i] * m[i] + 3 * m[i] - 7) % 97
// using 32-bit wrap-around arithmetic and the C (truncating) remainder.
//
// In:       x0 = m (left unchanged)
// Clobbers: x8-x14, flags.  Leaf routine, no stack use.
// ---------------------------------------------------------------------
        .text
        .globl  nonlinear
        .p2align 2
nonlinear:
        adrp    x8, N_eff
        ldr     w8, [x8, #:lo12:N_eff]
        mul     w9, w8, w8              // w9 = element count
        mov     w10, #0                 // w10 = i
        mov     w11, #97
        b       .Lnonlinear_check

.Lnonlinear_body:
        ldr     w12, [x0, w10, sxtw #2] // v = m[i]
        add     w13, w12, #3
        mul     w13, w13, w12           // v * (v + 3) == v*v + 3*v (mod 2^32)
        sub     w13, w13, #7
        sdiv    w14, w13, w11
        msub    w13, w14, w11, w13      // ... % 97
        str     w13, [x0, w10, sxtw #2]
        add     w10, w10, #1
.Lnonlinear_check:
        cmp     w10, w9
        b.lt    .Lnonlinear_body
        ret

// ---------------------------------------------------------------------
// void row_reduce(int *m)
//
// For every row of the n x n matrix at `m` (n = N_eff, row-major):
//     sum = m[row][0] + ... + m[row][n-1]      (32-bit wrap-around)
//     m[row][col] -= sum   for every col
// The sum is taken over the row's values before any of them is modified.
//
// In:       x0 = m (left unchanged)
// Register roles:
//     w8 = n, w9 = row, w10 = row * n, w11 = col, w12 = sum,
//     w13 / w14 = scratch
// Clobbers: x8-x14, flags.  Leaf routine, no stack use.
// N_eff is read once on entry; this assumes the matrix does not overlap
// N_eff itself.
// ---------------------------------------------------------------------
        .text
        .globl  row_reduce
        .p2align 2
row_reduce:
        adrp    x8, N_eff
        ldr     w8, [x8, #:lo12:N_eff]
        mov     w9, #0
        b       .Lrow_reduce_row_check

.Lrow_reduce_row:
        mul     w10, w9, w8             // index of the row's first element
        mov     w11, #0
        mov     w12, #0
        b       .Lrow_reduce_sum_check

.Lrow_reduce_sum:
        add     w13, w10, w11           // row * n + col
        ldr     w14, [x0, w13, sxtw #2]
        add     w12, w12, w14
        add     w11, w11, #1
.Lrow_reduce_sum_check:
        cmp     w11, w8
        b.lt    .Lrow_reduce_sum

        mov     w11, #0
        b       .Lrow_reduce_sub_check

.Lrow_reduce_sub:
        add     w13, w10, w11           // row * n + col
        ldr     w14, [x0, w13, sxtw #2]
        sub     w14, w14, w12
        str     w14, [x0, w13, sxtw #2]
        add     w11, w11, #1
.Lrow_reduce_sub_check:
        cmp     w11, w8
        b.lt    .Lrow_reduce_sub

        add     w9, w9, #1
.Lrow_reduce_row_check:
        cmp     w9, w8
        b.lt    .Lrow_reduce_row
        ret

// ---------------------------------------------------------------------
// int checksum(const int *m)
//
// Returns the 32-bit wrap-around sum of the first N_eff * N_eff words of
// `m`; 0 when that count is not positive.
//
// In:       x0 = m
// Out:      w0 = sum
// Clobbers: x8-x12, flags.  Leaf routine, no stack use.
// ---------------------------------------------------------------------
        .text
        .globl  checksum
        .p2align 2
checksum:
        adrp    x8, N_eff
        ldr     w8, [x8, #:lo12:N_eff]
        mul     w9, w8, w8              // w9 = element count
        mov     w10, #0                 // w10 = i
        mov     w11, #0                 // w11 = running sum
        b       .Lchecksum_check

.Lchecksum_body:
        ldr     w12, [x0, w10, sxtw #2]
        add     w11, w11, w12
        add     w10, w10, #1
.Lchecksum_check:
        cmp     w10, w9
        b.lt    .Lchecksum_body

        mov     w0, w11
        ret

// ---------------------------------------------------------------------
// int main(void)
//
// 1. state         = getint()
// 2. repeat_factor = getint()
// 3. N_eff         = state % 513 + 64
// 4. _sysy_starttime(134)
// 5. init_matrix(In); init_kernel(K); conv2d(In, Out, K)
// 6. nonlinear(Out); row_reduce(Out); result = checksum(Out)
// 7. _sysy_stoptime(145)
// 8. putint(result); putch(10)
// Returns 0.
//
// x19 and x20 (callee-saved) hold buffer addresses / the checksum across
// the calls.
// ---------------------------------------------------------------------
        .text
        .globl  main
        .p2align 2
main:
        stp     x29, x30, [sp, #-32]!
        mov     x29, sp
        stp     x19, x20, [sp, #16]

        bl      getint
        adrp    x8, state
        str     w0, [x8, #:lo12:state]

        bl      getint
        adrp    x8, repeat_factor
        str     w0, [x8, #:lo12:repeat_factor]

        adrp    x8, state
        ldr     w8, [x8, #:lo12:state]
        mov     w9, #513
        sdiv    w10, w8, w9
        msub    w8, w10, w9, w8         // state % 513
        add     w8, w8, #64
        adrp    x9, N_eff
        str     w8, [x9, #:lo12:N_eff]

        mov     w0, #134
        bl      _sysy_starttime

        adrp    x19, In
        add     x19, x19, :lo12:In
        mov     x0, x19
        bl      init_matrix

        adrp    x20, K
        add     x20, x20, :lo12:K
        mov     x0, x20
        bl      init_kernel

        mov     x0, x19                 // In
        adrp    x19, Out
        add     x19, x19, :lo12:Out     // x19 = Out from here on
        mov     x1, x19
        mov     x2, x20                 // K
        bl      conv2d

        mov     x0, x19
        bl      nonlinear

        mov     x0, x19
        bl      row_reduce

        mov     x0, x19
        bl      checksum
        mov     w19, w0                 // keep the checksum across the timer call

        mov     w0, #145
        bl      _sysy_stoptime

        mov     w0, w19
        bl      putint
        mov     w0, #10                 // '\n'
        bl      putch

        mov     w0, #0
        ldp     x19, x20, [sp, #16]
        ldp     x29, x30, [sp], #32
        ret