2 # Copyright 2018-2020 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
9 # ====================================================================
10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11 # project. The module is, however, dual licensed under OpenSSL and
12 # CRYPTOGAMS licenses depending on where you obtain it. For further
13 # details see http://www.openssl.org/~appro/cryptogams/.
14 # ====================================================================
16 # X25519 lower-level primitives for x86_64.
20 # This module implements radix 2^51 multiplication and squaring, and
21 # radix 2^64 multiplication, squaring, addition, subtraction and final
22 # reduction. The latter radix is used on ADCX/ADOX-capable processors
23 # such as Broadwell. On a related note, one should mention that there are
24 # vector implementations that provide significantly better performance
25 # on some processors(*), but they are large and overly complex, which,
26 # in combination with them being effectively processor-specific, makes
27 # the undertaking hard to justify. The goal for this implementation
28 # is rather versatility and simplicity [and ultimately formal
31 # (*) For example sandy2x should provide ~30% improvement on Sandy
32 # Bridge, but only nominal ~5% on Haswell [and big loss on
33 # Broadwell and successors].
35 ######################################################################
36 # Improvement coefficients:
38 # amd64-51(*) gcc-5.x(**)
41 # Sandy Bridge -3% +11%
43 # Broadwell(***) +30% +35%
44 # Skylake(***) +33% +47%
45 # Silvermont +20% +26%
48 # Ryzen(***) +43% +40%
51 # (*) amd64-51 is a popular assembly implementation with 2^51 radix,
52 # only multiplication and squaring subroutines were linked
53 # for comparison, but not complete ladder step; gain on most
54 # processors is because this module refrains from shld, and
55 # minor regression on others is because this does result in
56 # higher instruction count;
57 # (**) compiler is free to inline functions, in assembly one would
58 # need to implement ladder step to do that, and it will improve
59 # performance by several percent;
60 # (***) ADCX/ADOX result for 2^64 radix, there is no corresponding
61 # C implementation, so that comparison is always against
# If $flavour contains a dot it is really the output file name, so move it
# to $output and leave the flavour undefined.
66 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
# Win64 mode is selected for nasm/masm/mingw64 flavours or a .asm output name.
68 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# Look for x86_64-xlate.pl next to this script, then under ../../perlasm/.
70 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
71 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
72 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
73 die "can't locate x86_64-xlate.pl";
# Pipe everything written to OUT through the translator; abort right away
# if the translator process cannot be started instead of failing silently.
75 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!";
78 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
79 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
83 if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
84 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
88 if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
89 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
93 if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) {
94 my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
101 .globl x25519_fe51_mul
102 .type x25519_fe51_mul,\@function,3
119 .cfi_adjust_cfa_offset 40
122 mov 8*0(%rsi),%rax # f[0]
123 mov 8*0(%rdx),%r11 # load g[0-4]
129 mov %rdi,8*4(%rsp) # offload 1st argument
131 mulq %r11 # f[0]*g[0]
132 mov %r11,8*0(%rsp) # offload g[0]
133 mov %rax,%rbx # %rbx:%rcx = h0
136 mulq %r12 # f[0]*g[1]
137 mov %r12,8*1(%rsp) # offload g[1]
138 mov %rax,%r8 # %r8:%r9 = h1
140 lea (%r14,%r14,8),%r15
142 mulq %r13 # f[0]*g[2]
143 mov %r13,8*2(%rsp) # offload g[2]
144 mov %rax,%r10 # %r10:%r11 = h2
146 lea (%r14,%r15,2),%rdi # g[4]*19
148 mulq %rbp # f[0]*g[3]
149 mov %rax,%r12 # %r12:%r13 = h3
150 mov 8*0(%rsi),%rax # f[0]
152 mulq %r14 # f[0]*g[4]
153 mov %rax,%r14 # %r14:%r15 = h4
154 mov 8*1(%rsi),%rax # f[1]
157 mulq %rdi # f[1]*g[4]*19
159 mov 8*2(%rsi),%rax # f[2]
161 mulq %rdi # f[2]*g[4]*19
163 mov 8*3(%rsi),%rax # f[3]
165 mulq %rdi # f[3]*g[4]*19
167 mov 8*4(%rsi),%rax # f[4]
169 mulq %rdi # f[4]*g[4]*19
170 imulq \$19,%rbp,%rdi # g[3]*19
172 mov 8*1(%rsi),%rax # f[1]
174 mulq %rbp # f[1]*g[3]
175 mov 8*2(%rsp),%rbp # g[2]
177 mov 8*2(%rsi),%rax # f[2]
180 mulq %rdi # f[2]*g[3]*19
182 mov 8*3(%rsi),%rax # f[3]
184 mulq %rdi # f[3]*g[3]*19
186 mov 8*4(%rsi),%rax # f[4]
188 mulq %rdi # f[4]*g[3]*19
189 imulq \$19,%rbp,%rdi # g[2]*19
191 mov 8*1(%rsi),%rax # f[1]
193 mulq %rbp # f[1]*g[2]
195 mov 8*2(%rsi),%rax # f[2]
197 mulq %rbp # f[2]*g[2]
198 mov 8*1(%rsp),%rbp # g[1]
200 mov 8*3(%rsi),%rax # f[3]
203 mulq %rdi # f[3]*g[2]*19
205 mov 8*4(%rsi),%rax # f[4]
207 mulq %rdi # f[4]*g[2]*19
209 mov 8*1(%rsi),%rax # f[1]
211 mulq %rbp # f[1]*g[1]
214 mov 8*2(%rsi),%rax # f[2]
216 mulq %rbp # f[2]*g[1]
218 mov 8*3(%rsi),%rax # f[3]
220 mulq %rbp # f[3]*g[1]
221 mov 8*0(%rsp),%rbp # g[0]
223 mov 8*4(%rsi),%rax # f[4]
226 mulq %rdi # f[4]*g[1]*19
228 mov 8*1(%rsi),%rax # f[1]
232 mov 8*2(%rsi),%rax # f[2]
236 mov 8*3(%rsi),%rax # f[3]
240 mov 8*4(%rsi),%rax # f[4]
242 mulq %rbp # f[4]*g[0]
246 mov 8*4(%rsp),%rdi # restore 1st argument
250 .size x25519_fe51_mul,.-x25519_fe51_mul
252 .globl x25519_fe51_sqr
253 .type x25519_fe51_sqr,\@function,2
270 .cfi_adjust_cfa_offset 40
273 mov 8*0(%rsi),%rax # g[0]
274 mov 8*2(%rsi),%r15 # g[2]
275 mov 8*4(%rsi),%rbp # g[4]
277 mov %rdi,8*4(%rsp) # offload 1st argument
279 mulq %rax # g[0]*g[0]
281 mov 8*1(%rsi),%rax # g[1]
283 mulq %r14 # 2*g[0]*g[1]
286 mov %r15,8*0(%rsp) # offload g[2]
288 mulq %r14 # 2*g[0]*g[2]
292 imulq \$19,%rbp,%rdi # g[4]*19
293 mulq %r14 # 2*g[0]*g[3]
297 mulq %r14 # 2*g[0]*g[4]
302 mulq %rdi # g[4]*g[4]*19
304 mov 8*1(%rsi),%rax # g[1]
307 mov 8*3(%rsi),%rsi # g[3]
309 mulq %rax # g[1]*g[1]
311 mov 8*0(%rsp),%rax # g[2]
313 mulq %rbp # 2*g[1]*g[2]
317 mulq %rsi # 2*g[1]*g[3]
321 imulq \$19,%rsi,%rbp # g[3]*19
322 mulq %rdi # 2*g[1]*g[4]*19
327 mulq %rdi # 2*g[3]*g[4]*19
331 mulq %rbp # g[3]*g[3]*19
333 mov 8*0(%rsp),%rax # g[2]
337 mulq %rax # g[2]*g[2]
341 mulq %rsi # 2*g[2]*g[3]*19
345 mulq %rdi # 2*g[2]*g[4]*19
349 mov 8*4(%rsp),%rdi # restore 1st argument
354 mov \$0x7ffffffffffff,%rbp
359 and %rbp,%rdx # %rdx = g2 = h2 & mask
360 or %r10,%r11 # h2>>51
362 adc \$0,%r13 # h3 += h2>>51
367 and %rbp,%rax # %rax = g0 = h0 & mask
368 or %rbx,%rcx # h0>>51
369 add %rcx,%r8 # h1 += h0>>51
375 and %rbp,%rbx # %rbx = g3 = h3 & mask
376 or %r12,%r13 # h3>>51
377 add %r13,%r14 # h4 += h3>>51
383 and %rbp,%rcx # %rcx = g1 = h1 & mask
385 add %r9,%rdx # g2 += h1>>51
390 and %rbp,%r10 # %r10 = g4 = h4 & mask
391 or %r14,%r15 # h4>>51
393 lea (%r15,%r15,8),%r14
394 lea (%r15,%r14,2),%r15
395 add %r15,%rax # g0 += (h4>>51)*19
398 and %rbp,%rdx # g2 &= mask
400 add %r8,%rbx # g3 += g2>>51
403 and %rbp,%rax # g0 &= mask
405 add %r9,%rcx # g1 += g0>>51
407 mov %rax,8*0(%rdi) # save the result
426 .cfi_adjust_cfa_offset 88
430 .size x25519_fe51_sqr,.-x25519_fe51_sqr
432 .globl x25519_fe51_mul121666
433 .type x25519_fe51_mul121666,\@function,2
435 x25519_fe51_mul121666:
450 .cfi_adjust_cfa_offset 40
451 .Lfe51_mul121666_body:
455 mov %rax,%rbx # %rbx:%rcx = h0
459 mov %rax,%r8 # %r8:%r9 = h1
463 mov %rax,%r10 # %r10:%r11 = h2
467 mov %rax,%r12 # %r12:%r13 = h3
468 mov \$121666,%eax # multiplier constant 121666
471 mov %rax,%r14 # %r14:%r15 = h4
475 .Lfe51_mul121666_epilogue:
477 .size x25519_fe51_mul121666,.-x25519_fe51_mul121666
479 ########################################################################
480 # Base 2^64 subroutines modulo 2*(2^255-19)
# Accumulator names acc0..acc7 stand for registers %r8..%r15, in that order.
483 my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7) = map("%r$_",(8..15));
486 .extern OPENSSL_ia32cap_P
487 .globl x25519_fe64_eligible
488 .type x25519_fe64_eligible,\@abi-omnipotent
490 x25519_fe64_eligible:
492 mov OPENSSL_ia32cap_P+8(%rip),%ecx
499 .size x25519_fe64_eligible,.-x25519_fe64_eligible
501 .globl x25519_fe64_mul
502 .type x25519_fe64_mul,\@function,3
518 push %rdi # offload dst
521 .cfi_adjust_cfa_offset 16
525 mov 8*0(%rdx),%rbp # b[0]
526 mov 8*0(%rsi),%rdx # a[0]
527 mov 8*1(%rax),%rcx # b[1]
528 mov 8*2(%rax),$acc6 # b[2]
529 mov 8*3(%rax),$acc7 # b[3]
531 mulx %rbp,$acc0,%rax # a[0]*b[0]
532 xor %edi,%edi # cf=0,of=0
533 mulx %rcx,$acc1,%rbx # a[0]*b[1]
535 mulx $acc6,$acc2,%rax # a[0]*b[2]
537 mulx $acc7,$acc3,$acc4 # a[0]*b[3]
538 mov 8*1(%rsi),%rdx # a[1]
540 mov $acc6,(%rsp) # offload b[2]
541 adcx %rdi,$acc4 # cf=0
543 mulx %rbp,%rax,%rbx # a[1]*b[0]
546 mulx %rcx,%rax,%rbx # a[1]*b[1]
549 mulx $acc6,%rax,%rbx # a[1]*b[2]
552 mulx $acc7,%rax,$acc5 # a[1]*b[3]
553 mov 8*2(%rsi),%rdx # a[2]
555 adcx %rdi,$acc5 # cf=0
556 adox %rdi,$acc5 # of=0
558 mulx %rbp,%rax,%rbx # a[2]*b[0]
561 mulx %rcx,%rax,%rbx # a[2]*b[1]
564 mulx $acc6,%rax,%rbx # a[2]*b[2]
567 mulx $acc7,%rax,$acc6 # a[2]*b[3]
568 mov 8*3(%rsi),%rdx # a[3]
570 adox %rdi,$acc6 # of=0
571 adcx %rdi,$acc6 # cf=0
573 mulx %rbp,%rax,%rbx # a[3]*b[0]
576 mulx %rcx,%rax,%rbx # a[3]*b[1]
579 mulx (%rsp),%rax,%rbx # a[3]*b[2]
582 mulx $acc7,%rax,$acc7 # a[3]*b[3]
585 adcx %rdi,$acc7 # cf=0
586 adox %rdi,$acc7 # of=0
591 .size x25519_fe64_mul,.-x25519_fe64_mul
593 .globl x25519_fe64_sqr
594 .type x25519_fe64_sqr,\@function,2
610 push %rdi # offload dst
613 .cfi_adjust_cfa_offset 16
616 mov 8*0(%rsi),%rdx # a[0]
617 mov 8*1(%rsi),%rcx # a[1]
618 mov 8*2(%rsi),%rbp # a[2]
619 mov 8*3(%rsi),%rsi # a[3]
621 ################################################################
622 mulx %rdx,$acc0,$acc7 # a[0]*a[0]
623 mulx %rcx,$acc1,%rax # a[0]*a[1]
624 xor %edi,%edi # cf=0,of=0
625 mulx %rbp,$acc2,%rbx # a[0]*a[2]
627 mulx %rsi,$acc3,$acc4 # a[0]*a[3]
630 adcx %rdi,$acc4 # cf=0
632 ################################################################
633 mulx %rbp,%rax,%rbx # a[1]*a[2]
636 mulx %rsi,%rax,$acc5 # a[1]*a[3]
641 ################################################################
642 mulx %rsi,%rax,$acc6 # a[2]*a[3]
645 adcx %rdi,$acc6 # cf=0
646 adox %rdi,$acc6 # of=0
648 adcx $acc1,$acc1 # acc1:6<<1
651 mulx %rdx,%rax,%rbx # a[1]*a[1]
657 mulx %rdx,%rax,%rbx # a[2]*a[2]
663 mulx %rdx,%rax,$acc7 # a[3]*a[3]
666 adcx %rdi,$acc7 # cf=0
667 adox %rdi,$acc7 # of=0
681 mulx $acc7,%rax,$acc4
686 mov 8*2(%rsp),%rdi # restore dst
694 sbb %rax,%rax # cf -> mask
716 .cfi_adjust_cfa_offset 88
720 .size x25519_fe64_sqr,.-x25519_fe64_sqr
722 .globl x25519_fe64_mul121666
723 .type x25519_fe64_mul121666,\@function,2
725 x25519_fe64_mul121666:
726 .Lfe64_mul121666_body:
729 mulx 8*0(%rsi),$acc0,%rcx
730 mulx 8*1(%rsi),$acc1,%rax
732 mulx 8*2(%rsi),$acc2,%rcx
734 mulx 8*3(%rsi),$acc3,%rax
745 sbb %rax,%rax # cf -> mask
754 .Lfe64_mul121666_epilogue:
757 .size x25519_fe64_mul121666,.-x25519_fe64_mul121666
759 .globl x25519_fe64_add
760 .type x25519_fe64_add,\@function,3
775 sbb %rax,%rax # cf -> mask
784 sbb %rax,%rax # cf -> mask
794 .size x25519_fe64_add,.-x25519_fe64_add
796 .globl x25519_fe64_sub
797 .type x25519_fe64_sub,\@function,3
812 sbb %rax,%rax # cf -> mask
821 sbb %rax,%rax # cf -> mask
831 .size x25519_fe64_sub,.-x25519_fe64_sub
833 .globl x25519_fe64_tobytes
834 .type x25519_fe64_tobytes,\@function,2
844 ################################# reduction modulo 2^255-19
845 lea ($acc3,$acc3),%rax
846 sar \$63,$acc3 # most significant bit -> mask
847 shr \$1,%rax # most significant bit cleared
849 add \$19,$acc3 # compare to modulus in the same go
856 lea (%rax,%rax),$acc3
857 sar \$63,%rax # most significant bit -> mask
858 shr \$1,$acc3 # most significant bit cleared
875 .size x25519_fe64_tobytes,.-x25519_fe64_tobytes
879 .globl x25519_fe64_eligible
880 .type x25519_fe64_eligible,\@abi-omnipotent
882 x25519_fe64_eligible:
887 .size x25519_fe64_eligible,.-x25519_fe64_eligible
889 .globl x25519_fe64_mul
890 .type x25519_fe64_mul,\@abi-omnipotent
891 .globl x25519_fe64_sqr
892 .globl x25519_fe64_mul121666
893 .globl x25519_fe64_add
894 .globl x25519_fe64_sub
895 .globl x25519_fe64_tobytes
898 x25519_fe64_mul121666:
903 .byte 0x0f,0x0b # ud2
906 .size x25519_fe64_mul,.-x25519_fe64_mul
910 .asciz "X25519 primitives for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
913 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
914 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
922 .extern __imp_RtlVirtualUnwind
924 .type short_handler,\@abi-omnipotent
938 mov 120($context),%rax # pull context->Rax
939 mov 248($context),%rbx # pull context->Rip
941 mov 8($disp),%rsi # disp->ImageBase
942 mov 56($disp),%r11 # disp->HandlerData
944 mov 0(%r11),%r10d # HandlerData[0]
945 lea (%rsi,%r10),%r10 # end of prologue label
946 cmp %r10,%rbx # context->Rip<end of prologue label
949 mov 152($context),%rax # pull context->Rsp
950 jmp .Lcommon_seh_tail
951 .size short_handler,.-short_handler
953 .type full_handler,\@abi-omnipotent
967 mov 120($context),%rax # pull context->Rax
968 mov 248($context),%rbx # pull context->Rip
970 mov 8($disp),%rsi # disp->ImageBase
971 mov 56($disp),%r11 # disp->HandlerData
973 mov 0(%r11),%r10d # HandlerData[0]
974 lea (%rsi,%r10),%r10 # end of prologue label
975 cmp %r10,%rbx # context->Rip<end of prologue label
978 mov 152($context),%rax # pull context->Rsp
980 mov 4(%r11),%r10d # HandlerData[1]
981 lea (%rsi,%r10),%r10 # epilogue label
982 cmp %r10,%rbx # context->Rip>=epilogue label
983 jae .Lcommon_seh_tail
985 mov 8(%r11),%r10d # HandlerData[2]
994 mov %rbx,144($context) # restore context->Rbx
995 mov %rbp,160($context) # restore context->Rbp
996 mov %r12,216($context) # restore context->R12
997 mov %r13,224($context) # restore context->R13
998 mov %r14,232($context) # restore context->R14
999 mov %r15,240($context) # restore context->R15
1004 mov %rax,152($context) # restore context->Rsp
1005 mov %rsi,168($context) # restore context->Rsi
1006 mov %rdi,176($context) # restore context->Rdi
1008 mov 40($disp),%rdi # disp->ContextRecord
1009 mov $context,%rsi # context
1010 mov \$154,%ecx # sizeof(CONTEXT) in quadwords
1011 .long 0xa548f3fc # cld; rep movsq
1014 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1015 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1016 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1017 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1018 mov 40(%rsi),%r10 # disp->ContextRecord
1019 lea 56(%rsi),%r11 # &disp->HandlerData
1020 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1021 mov %r10,32(%rsp) # arg5
1022 mov %r11,40(%rsp) # arg6
1023 mov %r12,48(%rsp) # arg7
1024 mov %rcx,56(%rsp) # arg8, (NULL)
1025 call *__imp_RtlVirtualUnwind(%rip)
1027 mov \$1,%eax # ExceptionContinueSearch
1039 .size full_handler,.-full_handler
1043 .rva .LSEH_begin_x25519_fe51_mul
1044 .rva .LSEH_end_x25519_fe51_mul
1045 .rva .LSEH_info_x25519_fe51_mul
1047 .rva .LSEH_begin_x25519_fe51_sqr
1048 .rva .LSEH_end_x25519_fe51_sqr
1049 .rva .LSEH_info_x25519_fe51_sqr
1051 .rva .LSEH_begin_x25519_fe51_mul121666
1052 .rva .LSEH_end_x25519_fe51_mul121666
1053 .rva .LSEH_info_x25519_fe51_mul121666
1055 $code.=<<___ if ($addx);
1056 .rva .LSEH_begin_x25519_fe64_mul
1057 .rva .LSEH_end_x25519_fe64_mul
1058 .rva .LSEH_info_x25519_fe64_mul
1060 .rva .LSEH_begin_x25519_fe64_sqr
1061 .rva .LSEH_end_x25519_fe64_sqr
1062 .rva .LSEH_info_x25519_fe64_sqr
1064 .rva .LSEH_begin_x25519_fe64_mul121666
1065 .rva .LSEH_end_x25519_fe64_mul121666
1066 .rva .LSEH_info_x25519_fe64_mul121666
1068 .rva .LSEH_begin_x25519_fe64_add
1069 .rva .LSEH_end_x25519_fe64_add
1070 .rva .LSEH_info_x25519_fe64_add
1072 .rva .LSEH_begin_x25519_fe64_sub
1073 .rva .LSEH_end_x25519_fe64_sub
1074 .rva .LSEH_info_x25519_fe64_sub
1076 .rva .LSEH_begin_x25519_fe64_tobytes
1077 .rva .LSEH_end_x25519_fe64_tobytes
1078 .rva .LSEH_info_x25519_fe64_tobytes
1083 .LSEH_info_x25519_fe51_mul:
1086 .rva .Lfe51_mul_body,.Lfe51_mul_epilogue # HandlerData[]
1088 .LSEH_info_x25519_fe51_sqr:
1091 .rva .Lfe51_sqr_body,.Lfe51_sqr_epilogue # HandlerData[]
1093 .LSEH_info_x25519_fe51_mul121666:
1096 .rva .Lfe51_mul121666_body,.Lfe51_mul121666_epilogue # HandlerData[]
1099 $code.=<<___ if ($addx);
1100 .LSEH_info_x25519_fe64_mul:
1103 .rva .Lfe64_mul_body,.Lfe64_mul_epilogue # HandlerData[]
1105 .LSEH_info_x25519_fe64_sqr:
1108 .rva .Lfe64_sqr_body,.Lfe64_sqr_epilogue # HandlerData[]
1110 .LSEH_info_x25519_fe64_mul121666:
1113 .rva .Lfe64_mul121666_body,.Lfe64_mul121666_epilogue # HandlerData[]
1114 .LSEH_info_x25519_fe64_add:
1117 .rva .Lfe64_add_body,.Lfe64_add_epilogue # HandlerData[]
1118 .LSEH_info_x25519_fe64_sub:
1121 .rva .Lfe64_sub_body,.Lfe64_sub_epilogue # HandlerData[]
1122 .LSEH_info_x25519_fe64_tobytes:
1125 .rva .Lfe64_to_body,.Lfe64_to_epilogue # HandlerData[]
# Replace every backtick-quoted span in the generated code with the result
# of evaluating its contents as a Perl expression.
1129 $code =~ s/\`([^\`]*)\`/eval $1/gem;
# Abort with a diagnostic if STDOUT cannot be closed cleanly.
1131 close STDOUT or die "error closing STDOUT: $!";