10 #include <orc/orcprogram.h>
11 #include <orc/orcx86.h>
12 #include <orc/orcsse.h>
13 #include <orc/orcutils.h>
14 #include <orc/orcdebug.h>
/* Largest compile-time constant n for which orc_compiler_sse_assemble emits
   loop bodies back to back instead of loading n at run time and splitting it
   into counted regions. */
19 #define ORC_SSE_ALIGNED_DEST_CUTOFF 64
/* Forward declarations for routines that are referenced before their
   definitions.  Not all of them are defined in the visible part of this
   file. */
21 void orc_sse_emit_loop (OrcCompiler *compiler, int offset, int update);
23 void orc_compiler_sse_init (OrcCompiler *compiler);
24 unsigned int orc_compiler_sse_get_default_flags (void);
25 void orc_compiler_sse_assemble (OrcCompiler *compiler);
26 void orc_compiler_sse_register_rules (OrcTarget *target);
27 void orc_sse_emit_invariants (OrcCompiler *compiler);
30 void orc_compiler_rewrite_vars (OrcCompiler *compiler);
31 void orc_compiler_dump (OrcCompiler *compiler);
32 void sse_load_constant (OrcCompiler *compiler, int reg, int size, int value);
33 void sse_load_constant_long (OrcCompiler *compiler, int reg,
34 OrcConstant *constant);
35 static const char * sse_get_flag_name (int shift);
/* Target descriptor for this backend.  The visible initializer entries are
   the default-flags, compiler-init, assemble and long-constant-load
   callbacks; the remaining fields are on lines not shown in this listing. */
37 static OrcTarget sse_target = {
39 #if defined(HAVE_I386) || defined(HAVE_AMD64)
45 orc_compiler_sse_get_default_flags,
46 orc_compiler_sse_init,
47 orc_compiler_sse_assemble,
54 sse_load_constant_long
/* CPU feature flag words, defined in another translation unit. */
58 extern int orc_x86_sse_flags;
59 extern int orc_x86_mmx_flags;
/* Body of the target registration routine (its header line is not visible in
   this listing).  On x86 hosts it first queries the CPU, then marks the target
   as non-executable when the required baseline feature bit is absent, and
   finally registers the target and its rules.
   NOTE(review): both an SSE2 check on sse_target and an MMX check on
   mmx_target appear here; presumably they sit in alternative preprocessor
   branches -- confirm against the full file. */
64 #if defined(HAVE_AMD64) || defined(HAVE_I386)
65 /* initializes cache information */
66 orc_sse_get_cpu_flags ();
69 #if defined(HAVE_I386)
71 if (!(orc_x86_sse_flags & ORC_TARGET_SSE_SSE2)) {
72 sse_target.executable = FALSE;
75 if (!(orc_x86_mmx_flags & ORC_TARGET_MMX_MMX)) {
76 mmx_target.executable = FALSE;
81 orc_target_register (&sse_target);
83 orc_compiler_sse_register_rules (&sse_target);
/* Build the default target flag word for this backend and (per the prototype)
   return it as an unsigned int.
   NOTE(review): the preprocessor lines that select between the statements
   below are mostly missing from this listing, so which statements are
   alternatives of each other is inferred, not shown. */
87 orc_compiler_sse_get_default_flags (void)
89 unsigned int flags = 0;
/* Request 64-bit code generation (guard not visible; presumably only on
   64-bit hosts -- confirm). */
92 flags |= ORC_TARGET_SSE_64BIT;
/* Keep a frame pointer when the compiler debug flag is set. */
94 if (_orc_compiler_flag_debug) {
95 flags |= ORC_TARGET_SSE_FRAME_POINTER;
/* On x86 hosts, add the feature bits detected for the running CPU. */
98 #if defined(HAVE_AMD64) || defined(HAVE_I386)
100 flags |= orc_x86_sse_flags;
102 flags |= orc_x86_mmx_flags;
/* Fixed feature sets; presumably the fallback used when not building for an
   x86 host -- TODO confirm. */
106 flags |= ORC_TARGET_SSE_SSE2;
107 flags |= ORC_TARGET_SSE_SSE3;
108 flags |= ORC_TARGET_SSE_SSSE3;
110 flags |= ORC_TARGET_MMX_MMX;
111 flags |= ORC_TARGET_MMX_3DNOW;
/* Look up a name for the target flag at bit position `shift`.
   Two name lists are visible (one with SSE names, one with MMX names);
   presumably the preprocessor selects one of them -- confirm.  The visible
   range check accepts `shift` only when it indexes inside the table; the
   return statements are on lines not shown here. */
119 sse_get_flag_name (int shift)
121 static const char *flags[] = {
123 "sse2", "sse3", "ssse3", "sse41", "sse42", "sse4a", "sse5",
124 "frame_pointer", "short_jumps", "64bit"
126 "mmx", "mmxext", "3dnow", "3dnowext", "ssse3", "sse41", "",
127 "frame_pointer", "short_jumps", "64bit"
/* shift >= 0 is tested first, so the signed/unsigned comparison against the
   element count cannot be fooled by a negative value. */
131 if (shift >= 0 && shift < sizeof(flags)/sizeof(flags[0])) {
/* Prepare `compiler` for code generation on this target: derive mode switches
   from target_flags, fill the register tables, reserve the executor-pointer
   and scratch registers, and choose loop_shift and unroll_shift.
   NOTE(review): this listing skips lines (braces, else / #if branches, case
   labels), so the branch structure described below is partly inferred. */
139 orc_compiler_sse_init (OrcCompiler *compiler)
/* Translate target flags into compiler settings.  Long jumps are enabled
   unless SHORT_JUMPS is requested. */
143 if (compiler->target_flags & ORC_TARGET_SSE_64BIT) {
144 compiler->is_64bit = TRUE;
146 if (compiler->target_flags & ORC_TARGET_SSE_FRAME_POINTER) {
147 compiler->use_frame_pointer = TRUE;
149 if (!(compiler->target_flags & ORC_TARGET_SSE_SHORT_JUMPS)) {
150 compiler->long_jumps = TRUE;
/* 64-bit mode: 16 general-purpose registers are usable, except the stack
   pointer. */
154 if (compiler->is_64bit) {
155 for(i=ORC_GP_REG_BASE;i<ORC_GP_REG_BASE+16;i++){
156 compiler->valid_regs[i] = 1;
158 compiler->valid_regs[X86_ESP] = 0;
/* Vector registers: one loop enables 16 of them, the other 8.  Presumably
   these are alternative build variants -- confirm against the full file. */
160 for(i=X86_XMM0;i<X86_XMM0+16;i++){
161 compiler->valid_regs[i] = 1;
164 for(i=X86_XMM0;i<X86_XMM0+8;i++){
165 compiler->valid_regs[i] = 1;
/* Registers recorded in save_regs.  Presumably these are the callee-saved
   sets of the supported 64-bit calling conventions (the second group adds
   EDI, ESI and vector registers 6..15) -- TODO confirm. */
168 compiler->save_regs[X86_EBX] = 1;
169 compiler->save_regs[X86_EBP] = 1;
170 compiler->save_regs[X86_R12] = 1;
171 compiler->save_regs[X86_R13] = 1;
172 compiler->save_regs[X86_R14] = 1;
173 compiler->save_regs[X86_R15] = 1;
175 compiler->save_regs[X86_EDI] = 1;
176 compiler->save_regs[X86_ESI] = 1;
177 for(i=X86_XMM0+6;i<X86_XMM0+16;i++){
178 compiler->save_regs[i] = 1;
/* 32-bit mode: 8 general-purpose registers, minus ESP, and minus EBP when it
   is used as the frame pointer; 8 vector registers. */
182 for(i=ORC_GP_REG_BASE;i<ORC_GP_REG_BASE+8;i++){
183 compiler->valid_regs[i] = 1;
185 compiler->valid_regs[X86_ESP] = 0;
186 if (compiler->use_frame_pointer) {
187 compiler->valid_regs[X86_EBP] = 0;
189 for(i=X86_XMM0;i<X86_XMM0+8;i++){
190 compiler->valid_regs[i] = 1;
192 compiler->save_regs[X86_EBX] = 1;
193 compiler->save_regs[X86_EDI] = 1;
194 compiler->save_regs[X86_EBP] = 1;
/* Clear per-register allocation state (the enclosing loop header is not
   visible in this listing). */
197 compiler->alloc_regs[i] = 0;
198 compiler->used_regs[i] = 0;
/* Choose exec_reg (the base register used for all OrcExecutor field accesses
   in this file) and gp_tmpreg (a scratch general-purpose register).  The
   64-bit case shows two alternative pairs; the condition selecting between
   them is not visible.  In 32-bit mode exec_reg is EBX when a frame pointer
   is kept and EBP otherwise. */
201 if (compiler->is_64bit) {
203 compiler->exec_reg = X86_ECX;
204 compiler->gp_tmpreg = X86_EDX;
206 compiler->exec_reg = X86_EDI;
207 compiler->gp_tmpreg = X86_ECX;
210 compiler->gp_tmpreg = X86_ECX;
211 if (compiler->use_frame_pointer) {
212 compiler->exec_reg = X86_EBX;
214 compiler->exec_reg = X86_EBP;
/* Both reserved registers are removed from valid_regs. */
217 compiler->valid_regs[compiler->gp_tmpreg] = 0;
218 compiler->valid_regs[compiler->exec_reg] = 0;
/* loop_shift (log2 of elements handled per loop body) is picked from
   max_var_size; the case labels are not visible here, only the assigned
   values 4, 3, 2, 1 and the error for an unhandled size. */
220 switch (compiler->max_var_size) {
222 compiler->loop_shift = 4;
225 compiler->loop_shift = 3;
228 compiler->loop_shift = 2;
231 compiler->loop_shift = 1;
234 ORC_ERROR("unhandled max var size %d", compiler->max_var_size);
/* One further decrement; its guard is not visible (presumably the variant
   with half-width vector registers -- confirm). */
238 compiler->loop_shift--;
/* Unrolling policy: unroll once for programs of at most 10 instructions, but
   never when short jumps are in use or when loop_shift is 0. */
241 /* This limit is arbitrary, but some large functions run slightly
242 slower when unrolled (ginger Core2 6,15,6), and only some small
243 functions run faster when unrolled. Most are the same speed. */
244 if (compiler->n_insns <= 10) {
245 compiler->unroll_shift = 1;
247 if (!compiler->long_jumps) {
248 compiler->unroll_shift = 0;
250 if (compiler->loop_shift == 0) {
251 /* FIXME something is broken with loop_shift=0, unroll_shift=1 */
252 compiler->unroll_shift = 0;
254 compiler->alloc_loop_counter = TRUE;
255 compiler->allow_gp_on_stack = TRUE;
/* The first source of every resampling load (ldreslin / ldresnear opcodes) is
   flagged as needing an offset register. */
258 for(i=0;i<compiler->n_insns;i++){
259 OrcInstruction *insn = compiler->insns + i;
260 OrcStaticOpcode *opcode = insn->opcode;
262 if (strcmp (opcode->name, "ldreslinb") == 0 ||
263 strcmp (opcode->name, "ldreslinl") == 0 ||
264 strcmp (opcode->name, "ldresnearb") == 0 ||
265 strcmp (opcode->name, "ldresnearl") == 0) {
266 compiler->vars[insn->src_args[0]].need_offset_reg = TRUE;
/* Emit code that folds every accumulator variable's vector register down to
   a single value and stores it into the matching OrcExecutor accumulators[]
   slot.
   NOTE(review): the preprocessor lines choosing between the SSE (pshufd /
   pshuflw) and MMX (pshufw) shuffles are not visible in this listing. */
273 sse_save_accumulators (OrcCompiler *compiler)
279 for(i=0;i<ORC_N_COMPILER_VARIABLES;i++){
280 OrcVariable *var = compiler->vars + i;
/* Unnamed slots are unused. */
282 if (compiler->vars[i].name == NULL) continue;
283 switch (compiler->vars[i].vartype) {
284 case ORC_VAR_TYPE_ACCUMULATOR:
285 src = compiler->vars[i].alloc;
286 tmp = orc_compiler_get_temp_reg (compiler);
/* Step 1: shuffle (3,2,3,2) into tmp and add it back into src, using a word
   add for 2-byte accumulators and a dword add otherwise. */
289 orc_sse_emit_pshufd (compiler, ORC_SSE_SHUF(3,2,3,2), src, tmp);
291 orc_mmx_emit_pshufw (compiler, ORC_MMX_SHUF(3,2,3,2), src, tmp);
294 if (compiler->vars[i].size == 2) {
295 orc_sse_emit_660f (compiler, "paddw", 0xfd, tmp, src);
297 orc_sse_emit_660f (compiler, "paddd", 0xfe, tmp, src);
/* Step 2: shuffle (1,1,1,1) and add again. */
301 orc_sse_emit_pshufd (compiler, ORC_SSE_SHUF(1,1,1,1), src, tmp);
303 if (compiler->vars[i].size == 2) {
304 orc_sse_emit_660f (compiler, "paddw", 0xfd, tmp, src);
306 orc_sse_emit_660f (compiler, "paddd", 0xfe, tmp, src);
/* Step 3 (2-byte accumulators only): one more word-level shuffle and add. */
310 if (compiler->vars[i].size == 2) {
312 orc_sse_emit_pshuflw (compiler, ORC_SSE_SHUF(1,1,1,1), src, tmp);
314 orc_mmx_emit_pshufw (compiler, ORC_MMX_SHUF(1,1,1,1), src, tmp);
317 orc_sse_emit_660f (compiler, "paddw", 0xfd, tmp, src);
/* Store the result.  A 2-byte accumulator goes through gp_tmpreg so it can
   be masked to 16 bits before the 4-byte store; other sizes are stored as
   4 bytes straight from the vector register.  The slot index is relative to
   ORC_VAR_A1. */
320 if (compiler->vars[i].size == 2) {
321 orc_x86_emit_mov_sse_reg (compiler, src, compiler->gp_tmpreg);
322 orc_x86_emit_and_imm_reg (compiler, 4, 0xffff, compiler->gp_tmpreg);
323 orc_x86_emit_mov_reg_memoffset (compiler, 4, compiler->gp_tmpreg,
324 (int)ORC_STRUCT_OFFSET(OrcExecutor, accumulators[i-ORC_VAR_A1]),
327 orc_x86_emit_mov_sse_memoffset (compiler, 4, src,
328 (int)ORC_STRUCT_OFFSET(OrcExecutor, accumulators[i-ORC_VAR_A1]),
330 var->is_aligned, var->is_uncached);
/* Thin wrapper with an `int` value parameter that forwards to
   orc_sse_load_constant, whose value parameter is an orc_uint64. */
341 sse_load_constant (OrcCompiler *compiler, int reg, int size, int value)
343 orc_sse_load_constant (compiler, reg, size, value);
/* Emit code that fills vector register `reg` with the constant `value` of
   element width `size`, preferring short instruction sequences for special
   bit patterns before falling back to a move through a general-purpose
   register.
   NOTE(review): most of the guarding conditions (size checks, loop headers
   over `i`, early returns) are on lines missing from this listing; the
   grouping below is inferred from the visible statements. */
347 orc_sse_load_constant (OrcCompiler *compiler, int reg, int size, orc_uint64 value)
/* The arrays[ORC_VAR_T1] slot of the executor is used as scratch memory. */
352 int offset = ORC_STRUCT_OFFSET(OrcExecutor,arrays[ORC_VAR_T1]);
/* 64-bit constant path: write the low and high 32-bit halves to the scratch
   slot, load the 8 bytes into `reg`, then shuffle (1,0,1,0). */
354 /* FIXME how ugly and slow! */
355 orc_x86_emit_mov_imm_reg (compiler, 4, value>>0,
356 compiler->gp_tmpreg);
357 orc_x86_emit_mov_reg_memoffset (compiler, 4, compiler->gp_tmpreg,
358 offset + 0, compiler->exec_reg);
360 orc_x86_emit_mov_imm_reg (compiler, 4, value>>32,
361 compiler->gp_tmpreg);
362 orc_x86_emit_mov_reg_memoffset (compiler, 4, compiler->gp_tmpreg,
363 offset + 4, compiler->exec_reg);
365 orc_x86_emit_mov_memoffset_sse (compiler, 8, offset, compiler->exec_reg,
368 orc_sse_emit_pshufd (compiler, ORC_SSE_SHUF(1,0,1,0), reg, reg);
/* Replicate a narrow value upward by OR-ing shifted copies (by 8 then 16, or
   by 16 only); presumably selected by size 1 and size 2 -- confirm. */
375 value |= (value << 8);
376 value |= (value << 16);
380 value |= (value << 16);
383 ORC_ASM_CODE(compiler, "# loading constant %d 0x%08x\n", (int)value, (int)value);
/* Clear the register with pxor (guard not visible; presumably value == 0). */
385 orc_sse_emit_pxor(compiler, reg, reg);
/* All ones: comparing a register with itself sets every bit. */
388 if (value == 0xffffffff) {
389 orc_sse_emit_pcmpeqb (compiler, reg, reg);
/* With SSSE3, 0x01 in every byte is produced as the byte-wise absolute value
   of all-ones. */
392 if (compiler->target_flags & ORC_TARGET_SSE_SSSE3) {
393 if (value == 0x01010101) {
394 orc_sse_emit_pcmpeqb (compiler, reg, reg);
395 orc_sse_emit_pabsb (compiler, reg, reg);
/* All-ones shifted left or right by `i` bits per 32-bit lane (the loops and
   comparisons against `value` are not visible). */
404 orc_sse_emit_pcmpeqb (compiler, reg, reg);
405 orc_sse_emit_pslld (compiler, i, reg);
410 orc_sse_emit_pcmpeqb (compiler, reg, reg);
411 orc_sse_emit_psrld (compiler, i, reg);
/* Same idea per 16-bit lane: `v` is the pattern obtained by shifting 0xffff
   by `i` within each half of a 32-bit word. */
417 v = (0xffff & (0xffff<<i)) | (0xffff0000 & (0xffff0000<<i));
419 orc_sse_emit_pcmpeqb (compiler, reg, reg);
420 orc_sse_emit_psllw (compiler, i, reg);
423 v = (0xffff & (0xffff>>i)) | (0xffff0000 & (0xffff0000>>i));
425 orc_sse_emit_pcmpeqb (compiler, reg, reg);
426 orc_sse_emit_psrlw (compiler, i, reg);
/* General case: move the 32-bit immediate through gp_tmpreg into `reg` and
   shuffle it across the register (pshufd for SSE, pshufw for MMX). */
431 orc_x86_emit_mov_imm_reg (compiler, 4, value, compiler->gp_tmpreg);
432 orc_x86_emit_mov_reg_sse (compiler, compiler->gp_tmpreg, reg);
434 orc_sse_emit_pshufd (compiler, ORC_SSE_SHUF(0,0,0,0), reg, reg);
436 orc_mmx_emit_pshufw (compiler, ORC_MMX_SHUF(1,0,1,0), reg, reg);
/* Emit code that loads a constant given as 32-bit words in
   constant->full_value[] into vector register `reg`.  Each word is written
   through gp_tmpreg to the executor's arrays[ORC_VAR_T1] slot (used here as
   scratch memory) at offset 4*i, and the assembled 16 bytes are then loaded
   into `reg`.  The loop header over `i` is not visible in this listing;
   the log line suggests four words (indices 0..3). */
441 sse_load_constant_long (OrcCompiler *compiler, int reg,
442 OrcConstant *constant)
445 int offset = ORC_STRUCT_OFFSET(OrcExecutor,arrays[ORC_VAR_T1]);
447 /* FIXME this is slower than it could be */
449 ORC_ASM_CODE(compiler, "# loading constant %08x %08x %08x %08x\n",
450 constant->full_value[0], constant->full_value[1],
451 constant->full_value[2], constant->full_value[3]);
454 orc_x86_emit_mov_imm_reg (compiler, 4, constant->full_value[i],
455 compiler->gp_tmpreg);
456 orc_x86_emit_mov_reg_memoffset (compiler, 4, compiler->gp_tmpreg,
457 offset + 4*i, compiler->exec_reg);
459 orc_x86_emit_mov_memoffset_sse (compiler, 16, offset, compiler->exec_reg,
/* Emit the setup code that runs once, before the loops: zero the
   accumulators, emit loop-invariant instructions, materialize constants in
   registers, and initialize the offset registers of resampling loads. */
465 sse_load_constants_outer (OrcCompiler *compiler)
/* Per-variable setup.  In the visible lines only accumulators produce code
   (a pxor of the register with itself); an unknown vartype is a compile
   error. */
468 for(i=0;i<ORC_N_COMPILER_VARIABLES;i++){
469 if (compiler->vars[i].name == NULL) continue;
470 switch (compiler->vars[i].vartype) {
471 case ORC_VAR_TYPE_CONST:
473 case ORC_VAR_TYPE_PARAM:
475 case ORC_VAR_TYPE_SRC:
476 case ORC_VAR_TYPE_DEST:
478 case ORC_VAR_TYPE_ACCUMULATOR:
479 orc_sse_emit_pxor (compiler,
480 compiler->vars[i].alloc, compiler->vars[i].alloc);
482 case ORC_VAR_TYPE_TEMP:
485 ORC_COMPILER_ERROR(compiler,"bad vartype");
490 orc_sse_emit_invariants (compiler);
/* Ask for a register for each constant, then load the constants that got one
   (alloc_reg non-zero): long constants via sse_load_constant_long, the rest
   as 4-byte values. */
492 /* FIXME move to a better place */
493 for(i=0;i<compiler->n_constants;i++){
494 compiler->constants[i].alloc_reg =
495 orc_compiler_get_constant_reg (compiler);
498 for(i=0;i<compiler->n_constants;i++){
499 if (compiler->constants[i].alloc_reg) {
500 if (compiler->constants[i].is_long) {
501 sse_load_constant_long (compiler, compiler->constants[i].alloc_reg,
502 compiler->constants + i);
504 sse_load_constant (compiler, compiler->constants[i].alloc_reg,
505 4, compiler->constants[i].value);
/* For resampling loads, initialize the first source's ptr_offset register
   from the second source: read from the executor's params[] when it is a
   parameter, otherwise use its immediate value. */
511 for(i=0;i<compiler->n_insns;i++){
512 OrcInstruction *insn = compiler->insns + i;
513 OrcStaticOpcode *opcode = insn->opcode;
515 if (strcmp (opcode->name, "ldreslinb") == 0 ||
516 strcmp (opcode->name, "ldreslinl") == 0 ||
517 strcmp (opcode->name, "ldresnearb") == 0 ||
518 strcmp (opcode->name, "ldresnearl") == 0) {
519 if (compiler->vars[insn->src_args[1]].vartype == ORC_VAR_TYPE_PARAM) {
520 orc_x86_emit_mov_memoffset_reg (compiler, 4,
521 (int)ORC_STRUCT_OFFSET(OrcExecutor, params[insn->src_args[1]]),
523 compiler->vars[insn->src_args[0]].ptr_offset);
525 orc_x86_emit_mov_imm_reg (compiler, 4,
526 compiler->vars[insn->src_args[1]].value.i,
527 compiler->vars[insn->src_args[0]].ptr_offset);
/* Emit the per-row setup: for every source and destination variable that has
   a pointer register, load its array pointer from the executor's arrays[]
   slot (8 bytes in 64-bit mode, 4 otherwise).  Other variable types emit
   nothing in the visible lines; an unknown vartype is a compile error. */
535 sse_load_constants_inner (OrcCompiler *compiler)
538 for(i=0;i<ORC_N_COMPILER_VARIABLES;i++){
539 if (compiler->vars[i].name == NULL) continue;
540 switch (compiler->vars[i].vartype) {
541 case ORC_VAR_TYPE_CONST:
543 case ORC_VAR_TYPE_PARAM:
545 case ORC_VAR_TYPE_SRC:
546 case ORC_VAR_TYPE_DEST:
547 if (compiler->vars[i].ptr_register) {
548 orc_x86_emit_mov_memoffset_reg (compiler, compiler->is_64bit ? 8 : 4,
549 (int)ORC_STRUCT_OFFSET(OrcExecutor, arrays[i]), compiler->exec_reg,
550 compiler->vars[i].ptr_register);
553 case ORC_VAR_TYPE_ACCUMULATOR:
555 case ORC_VAR_TYPE_TEMP:
558 ORC_COMPILER_ERROR(compiler,"bad vartype");
/* Emit code that advances every source and destination array pointer stored
   in the executor by that array's stride.  The stride is read from the
   executor's params[i] slot into gp_tmpreg and added to arrays[i] in memory.
   A variable without a pointer register triggers a compile error. */
565 sse_add_strides (OrcCompiler *compiler)
569 for(i=0;i<ORC_N_COMPILER_VARIABLES;i++){
570 if (compiler->vars[i].name == NULL) continue;
571 switch (compiler->vars[i].vartype) {
572 case ORC_VAR_TYPE_CONST:
574 case ORC_VAR_TYPE_PARAM:
576 case ORC_VAR_TYPE_SRC:
577 case ORC_VAR_TYPE_DEST:
/* NOTE(review): the stride is loaded as 4 bytes but the add below uses
   pointer width (8 bytes in 64-bit mode).  Whether a negative stride is
   sign-extended depends on orc_x86_emit_mov_memoffset_reg -- confirm. */
578 orc_x86_emit_mov_memoffset_reg (compiler, 4,
579 (int)ORC_STRUCT_OFFSET(OrcExecutor, params[i]), compiler->exec_reg,
580 compiler->gp_tmpreg);
/* The register operand of this add is on a line missing from this listing
   (presumably gp_tmpreg). */
581 orc_x86_emit_add_reg_memoffset (compiler, compiler->is_64bit ? 8 : 4,
583 (int)ORC_STRUCT_OFFSET(OrcExecutor, arrays[i]), compiler->exec_reg);
585 if (compiler->vars[i].ptr_register == 0) {
586 ORC_COMPILER_ERROR(compiler, "unimplemented: stride on mem pointer");
589 case ORC_VAR_TYPE_ACCUMULATOR:
591 case ORC_VAR_TYPE_TEMP:
594 ORC_COMPILER_ERROR(compiler,"bad vartype");
/* Pick the array variable whose alignment the generated loops are organized
   around.  Variables ORC_VAR_D1..ORC_VAR_S8 are scanned in three passes of
   decreasing strictness: first one whose per-iteration footprint
   (size << loop_shift) is at least 16 bytes, then at least 8 bytes, then any
   variable with a non-zero size.  If all passes fail, a compile error is
   raised.  The return statements are on lines not visible in this listing
   (presumably each pass returns the matching index). */
601 get_align_var (OrcCompiler *compiler)
604 for(i=ORC_VAR_D1;i<=ORC_VAR_S8;i++){
605 if (compiler->vars[i].size == 0) continue;
606 if ((compiler->vars[i].size << compiler->loop_shift) >= 16) {
610 for(i=ORC_VAR_D1;i<=ORC_VAR_S8;i++){
611 if (compiler->vars[i].size == 0) continue;
612 if ((compiler->vars[i].size << compiler->loop_shift) >= 8) {
616 for(i=ORC_VAR_D1;i<=ORC_VAR_S8;i++){
617 if (compiler->vars[i].size == 0) continue;
621 ORC_COMPILER_ERROR(compiler, "could not find alignment variable");
/* Error report for an unsupported `size`.  The header and the rest of the
   enclosing function are not visible in this listing (presumably the
   get_shift helper called by the region-splitting routines -- confirm). */
639 ORC_ERROR("bad size %d", size);
/* Emit code that splits the run-time element count n into three counters
   stored in the executor:
     counter1 - leading elements processed before the alignment variable's
                pointer reaches the alignment boundary,
     counter2 - iterations of the main (shifted, possibly unrolled) loop,
     counter3 - trailing elements left over after the main loop.
   When n does not exceed the leading count, everything goes into counter1
   and the other two are zero.  Local labels 6 and 7 are used for the
   branch. */
646 orc_emit_split_3_regions (OrcCompiler *compiler)
652 align_var = get_align_var (compiler);
653 var_size_shift = get_shift (compiler->vars[align_var].size);
654 align_shift = var_size_shift + compiler->loop_shift;
/* EAX = ((16 - pointer) & ((1 << align_shift) - 1)) >> var_size_shift, using
   only the low 32 bits of the pointer: a byte distance converted to an
   element count. */
656 /* determine how many iterations until align array is aligned (n1) */
657 orc_x86_emit_mov_imm_reg (compiler, 4, 16, X86_EAX);
658 orc_x86_emit_sub_memoffset_reg (compiler, 4,
659 (int)ORC_STRUCT_OFFSET(OrcExecutor, arrays[align_var]),
660 compiler->exec_reg, X86_EAX);
661 orc_x86_emit_and_imm_reg (compiler, 4, (1<<align_shift) - 1, X86_EAX);
662 orc_x86_emit_sar_imm_reg (compiler, 4, var_size_shift, X86_EAX);
664 /* check if n1 is greater than n. */
665 orc_x86_emit_cmp_reg_memoffset (compiler, 4, X86_EAX,
666 (int)ORC_STRUCT_OFFSET(OrcExecutor,n), compiler->exec_reg);
668 orc_x86_emit_jle (compiler, 6);
670 /* If so, we have a standard 3-region split. */
671 orc_x86_emit_mov_reg_memoffset (compiler, 4, X86_EAX,
672 (int)ORC_STRUCT_OFFSET(OrcExecutor,counter1), compiler->exec_reg);
/* remaining = n - n1, kept in both gp_tmpreg and EAX. */
675 orc_x86_emit_mov_memoffset_reg (compiler, 4,
676 (int)ORC_STRUCT_OFFSET(OrcExecutor,n), compiler->exec_reg,
677 compiler->gp_tmpreg);
678 orc_x86_emit_sub_reg_reg (compiler, 4, X86_EAX, compiler->gp_tmpreg);
680 orc_x86_emit_mov_reg_reg (compiler, 4, compiler->gp_tmpreg, X86_EAX);
/* counter2 = remaining >> (loop_shift + unroll_shift). */
682 orc_x86_emit_sar_imm_reg (compiler, 4,
683 compiler->loop_shift + compiler->unroll_shift,
684 compiler->gp_tmpreg);
685 orc_x86_emit_mov_reg_memoffset (compiler, 4, compiler->gp_tmpreg,
686 (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2), compiler->exec_reg);
/* counter3 = remaining & ((1 << (loop_shift + unroll_shift)) - 1). */
689 orc_x86_emit_and_imm_reg (compiler, 4,
690 (1<<(compiler->loop_shift + compiler->unroll_shift))-1, X86_EAX);
691 orc_x86_emit_mov_reg_memoffset (compiler, 4, X86_EAX,
692 (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3), compiler->exec_reg);
694 orc_x86_emit_jmp (compiler, 7);
696 /* else, iterations are all unaligned: n1=n, n2=0, n3=0 */
697 orc_x86_emit_label (compiler, 6);
699 orc_x86_emit_mov_memoffset_reg (compiler, 4,
700 (int)ORC_STRUCT_OFFSET(OrcExecutor,n), compiler->exec_reg, X86_EAX);
701 orc_x86_emit_mov_reg_memoffset (compiler, 4, X86_EAX,
702 (int)ORC_STRUCT_OFFSET(OrcExecutor,counter1), compiler->exec_reg);
703 orc_x86_emit_mov_imm_reg (compiler, 4, 0, X86_EAX);
704 orc_x86_emit_mov_reg_memoffset (compiler, 4, X86_EAX,
705 (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2), compiler->exec_reg);
706 orc_x86_emit_mov_reg_memoffset (compiler, 4, X86_EAX,
707 (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3), compiler->exec_reg);
709 orc_x86_emit_label (compiler, 7);
/* Emit code that splits the run-time element count n into two counters with
   no leading alignment region:
     counter2 = n >> (loop_shift + unroll_shift)   (main-loop iterations)
     counter3 = n & ((1 << (loop_shift + unroll_shift)) - 1)   (leftover)
   align_var, var_size_shift and align_shift are computed but not used by any
   of the visible statements. */
713 orc_emit_split_2_regions (OrcCompiler *compiler)
719 align_var = get_align_var (compiler);
720 var_size_shift = get_shift (compiler->vars[align_var].size);
721 align_shift = var_size_shift + compiler->loop_shift;
/* Load n into gp_tmpreg and keep a copy in EAX for the mask below. */
724 orc_x86_emit_mov_memoffset_reg (compiler, 4,
725 (int)ORC_STRUCT_OFFSET(OrcExecutor,n), compiler->exec_reg,
726 compiler->gp_tmpreg);
727 orc_x86_emit_mov_reg_reg (compiler, 4, compiler->gp_tmpreg, X86_EAX);
728 orc_x86_emit_sar_imm_reg (compiler, 4,
729 compiler->loop_shift + compiler->unroll_shift,
730 compiler->gp_tmpreg);
731 orc_x86_emit_mov_reg_memoffset (compiler, 4, compiler->gp_tmpreg,
732 (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2), compiler->exec_reg);
735 orc_x86_emit_and_imm_reg (compiler, 4,
736 (1<<(compiler->loop_shift + compiler->unroll_shift))-1, X86_EAX);
737 orc_x86_emit_mov_reg_memoffset (compiler, 4, X86_EAX,
738 (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3), compiler->exec_reg);
/* Report whether the program contains a floating-point instruction: returns
   TRUE as soon as an instruction's opcode carries ORC_STATIC_OPCODE_FLOAT.
   The result for the no-match case is on a line not visible in this listing
   (presumably FALSE). */
743 orc_program_has_float (OrcCompiler *compiler)
746 for(j=0;j<compiler->n_insns;j++){
747 OrcInstruction *insn = compiler->insns + j;
748 OrcStaticOpcode *opcode = insn->opcode;
749 if (opcode->flags & ORC_STATIC_OPCODE_FLOAT) return TRUE;
/* Local jump-label numbers used by orc_compiler_sse_assemble.  Labels 6 and 7
   are not named here but are used directly by orc_emit_split_3_regions.
   LABEL_STEP_DOWN(x) starts at 8 and LABEL_STEP_UP(x) at 13, so the two
   families stay disjoint only while x < 5. */
755 #define LABEL_REGION1_SKIP 1
756 #define LABEL_INNER_LOOP_START 2
757 #define LABEL_REGION2_SKIP 3
758 #define LABEL_OUTER_LOOP 4
759 #define LABEL_OUTER_LOOP_SKIP 5
760 #define LABEL_STEP_DOWN(x) (8+(x))
761 #define LABEL_STEP_UP(x) (13+(x))
/* Generate the complete machine code for the program held by `compiler`:
   prologue, one-time setup, an optional outer loop over rows for 2-D
   programs, the inner processing (either a straight-line sequence for small
   constant n, or up to three counted regions), accumulator write-back and
   epilogue, followed by jump fixups.
   NOTE(review): this listing omits braces, else branches and some guards, so
   the nesting described in the comments below is partly inferred. */
765 orc_compiler_sse_assemble (OrcCompiler *compiler)
768 int set_mxcsr = FALSE;
/* The dedicated copy path is compiled in but disabled by the leading `0 &&`. */
772 if (0 && orc_x86_assemble_copy_check (compiler)) {
773 /* The rep movs implementation isn't faster most of the time */
774 orc_x86_assemble_copy (compiler);
778 align_var = get_align_var (compiler);
780 compiler->vars[align_var].is_aligned = FALSE;
/* Trial emission: emit the loop body once, then throw the output away by
   resetting the code pointer, the assembly text, the labels and the fixup
   count.  Presumably this lets errors and register usage be discovered
   before the real pass -- confirm.  Compilation stops here on error. */
783 orc_sse_emit_loop (compiler, 0, 0);
785 compiler->codeptr = compiler->code;
786 free (compiler->asm_code);
787 compiler->asm_code = NULL;
788 compiler->asm_code_len = 0;
789 memset (compiler->labels, 0, sizeof (compiler->labels));
790 compiler->n_fixups = 0;
793 if (compiler->error) return;
795 orc_x86_emit_prologue (compiler);
/* Programs with floating-point opcodes get MXCSR set up here (and restored
   near the end of this function). */
798 if (orc_program_has_float (compiler)) {
800 orc_sse_set_mxcsr (compiler);
804 sse_load_constants_outer (compiler);
/* 2-D programs: the row counter lives in params[ORC_VAR_A2].  It is seeded
   either from the compile-time constant_m or from params[ORC_VAR_A1]; in the
   latter case a non-positive count skips the whole outer loop. */
806 if (compiler->program->is_2d) {
807 if (compiler->program->constant_m > 0) {
808 orc_x86_emit_mov_imm_reg (compiler, 4, compiler->program->constant_m,
810 orc_x86_emit_mov_reg_memoffset (compiler, 4, X86_EAX,
811 (int)ORC_STRUCT_OFFSET(OrcExecutor, params[ORC_VAR_A2]),
814 orc_x86_emit_mov_memoffset_reg (compiler, 4,
815 (int)ORC_STRUCT_OFFSET(OrcExecutor, params[ORC_VAR_A1]),
816 compiler->exec_reg, X86_EAX);
817 orc_x86_emit_test_reg_reg (compiler, 4, X86_EAX, X86_EAX);
818 orc_x86_emit_jle (compiler, LABEL_OUTER_LOOP_SKIP);
819 orc_x86_emit_mov_reg_memoffset (compiler, 4, X86_EAX,
820 (int)ORC_STRUCT_OFFSET(OrcExecutor, params[ORC_VAR_A2]),
824 orc_x86_emit_label (compiler, LABEL_OUTER_LOOP);
/* Compute the region counters, unless n is a small compile-time constant.
   Programs with an iterator opcode use the 2-region split (no alignment
   prologue); with loop_shift == 0 the whole of n becomes counter2. */
827 if (compiler->program->constant_n > 0 &&
828 compiler->program->constant_n <= ORC_SSE_ALIGNED_DEST_CUTOFF) {
829 /* don't need to load n */
830 } else if (compiler->loop_shift > 0) {
831 if (!compiler->has_iterator_opcode) {
832 /* split n into three regions, with center region being aligned */
833 orc_emit_split_3_regions (compiler);
835 orc_emit_split_2_regions (compiler);
838 /* loop shift is 0, no need to split */
839 orc_x86_emit_mov_memoffset_reg (compiler, 4,
840 (int)ORC_STRUCT_OFFSET(OrcExecutor,n), compiler->exec_reg,
841 compiler->gp_tmpreg);
842 orc_x86_emit_mov_reg_memoffset (compiler, 4, compiler->gp_tmpreg,
843 (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2), compiler->exec_reg);
846 sse_load_constants_inner (compiler);
/* Small constant n: emit loop bodies back to back at compile time.  Full-width
   bodies are emitted while they fit, then progressively smaller loop shifts
   cover the remainder; compiler->offset tracks the element position and the
   original loop_shift is restored afterwards. */
848 if (compiler->program->constant_n > 0 &&
849 compiler->program->constant_n <= ORC_SSE_ALIGNED_DEST_CUTOFF) {
850 int n_left = compiler->program->constant_n;
854 compiler->offset = 0;
856 save_loop_shift = compiler->loop_shift;
857 while (n_left >= (1<<compiler->loop_shift)) {
858 ORC_ASM_CODE(compiler, "# LOOP SHIFT %d\n", compiler->loop_shift);
859 orc_sse_emit_loop (compiler, compiler->offset, 0);
861 n_left -= 1<<compiler->loop_shift;
862 compiler->offset += 1<<compiler->loop_shift;
864 for(loop_shift = compiler->loop_shift-1; loop_shift>=0; loop_shift--) {
865 if (n_left >= (1<<loop_shift)) {
866 compiler->loop_shift = loop_shift;
867 ORC_ASM_CODE(compiler, "# LOOP SHIFT %d\n", loop_shift);
868 orc_sse_emit_loop (compiler, compiler->offset, 0);
869 n_left -= 1<<loop_shift;
870 compiler->offset += 1<<loop_shift;
873 compiler->loop_shift = save_loop_shift;
/* Run-time n: decide which of the leading (region 1) and trailing (region 3)
   sections are needed.  Iterator opcodes disable region 1; loop_shift == 0
   disables both. */
877 int emit_region1 = TRUE;
878 int emit_region3 = TRUE;
880 if (compiler->has_iterator_opcode) {
881 emit_region1 = FALSE;
883 if (compiler->loop_shift == 0) {
884 emit_region1 = FALSE;
885 emit_region3 = FALSE;
/* Region 1: for each loop shift l below the full one, in increasing order,
   run one body of 2^l elements if bit l of counter1 is set.  The alignment
   variable is treated as unaligned here and as aligned afterwards. */
892 save_loop_shift = compiler->loop_shift;
893 compiler->vars[align_var].is_aligned = FALSE;
895 for (l=0;l<save_loop_shift;l++){
896 compiler->loop_shift = l;
897 ORC_ASM_CODE(compiler, "# LOOP SHIFT %d\n", compiler->loop_shift);
899 orc_x86_emit_test_imm_memoffset (compiler, 4, 1<<compiler->loop_shift,
900 (int)ORC_STRUCT_OFFSET(OrcExecutor,counter1), compiler->exec_reg);
901 orc_x86_emit_je (compiler, LABEL_STEP_UP(compiler->loop_shift));
902 orc_sse_emit_loop (compiler, 0, 1<<compiler->loop_shift);
903 orc_x86_emit_label (compiler, LABEL_STEP_UP(compiler->loop_shift));
906 compiler->loop_shift = save_loop_shift;
907 compiler->vars[align_var].is_aligned = TRUE;
910 orc_x86_emit_label (compiler, LABEL_REGION1_SKIP);
/* Region 2: the main loop, skipped entirely when counter2 is zero.  The
   iteration count is kept in a register when one was allocated, otherwise it
   is decremented in memory. */
912 orc_x86_emit_cmp_imm_memoffset (compiler, 4, 0,
913 (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2), compiler->exec_reg);
914 orc_x86_emit_je (compiler, LABEL_REGION2_SKIP);
916 if (compiler->loop_counter != ORC_REG_INVALID) {
917 orc_x86_emit_mov_memoffset_reg (compiler, 4,
918 (int)ORC_STRUCT_OFFSET(OrcExecutor, counter2), compiler->exec_reg,
919 compiler->loop_counter);
922 ORC_ASM_CODE(compiler, "# LOOP SHIFT %d\n", compiler->loop_shift);
923 orc_x86_emit_align (compiler);
924 orc_x86_emit_label (compiler, LABEL_INNER_LOOP_START);
/* Unrolled bodies: each copy works at its own element offset, and only the
   last copy passes a non-zero `update` (the total element count of all
   copies) so pointers are advanced once per loop iteration. */
925 ui_max = 1<<compiler->unroll_shift;
926 for(ui=0;ui<ui_max;ui++) {
927 compiler->offset = ui<<compiler->loop_shift;
928 orc_sse_emit_loop (compiler, compiler->offset,
929 (ui==ui_max-1) << (compiler->loop_shift + compiler->unroll_shift));
931 compiler->offset = 0;
932 if (compiler->loop_counter != ORC_REG_INVALID) {
933 orc_x86_emit_add_imm_reg (compiler, 4, -1, compiler->loop_counter, TRUE);
935 orc_x86_emit_dec_memoffset (compiler, 4,
936 (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2),
939 orc_x86_emit_jne (compiler, LABEL_INNER_LOOP_START);
940 orc_x86_emit_label (compiler, LABEL_REGION2_SKIP);
/* Region 3: mirror of region 1 for the tail, walking the loop shifts
   downward from loop_shift + unroll_shift - 1 and testing bits of counter3.
   NOTE(review): the value restored into compiler->loop_shift at the end is
   loop_shift + unroll_shift, not the original loop_shift -- confirm this is
   intended for 2-D programs that loop back to LABEL_OUTER_LOOP. */
946 save_loop_shift = compiler->loop_shift + compiler->unroll_shift;
947 compiler->vars[align_var].is_aligned = FALSE;
949 for(l=save_loop_shift - 1; l >= 0; l--) {
950 compiler->loop_shift = l;
951 ORC_ASM_CODE(compiler, "# LOOP SHIFT %d\n", compiler->loop_shift);
953 orc_x86_emit_test_imm_memoffset (compiler, 4, 1<<compiler->loop_shift,
954 (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3), compiler->exec_reg);
955 orc_x86_emit_je (compiler, LABEL_STEP_DOWN(compiler->loop_shift));
956 orc_sse_emit_loop (compiler, 0, 1<<compiler->loop_shift);
957 orc_x86_emit_label (compiler, LABEL_STEP_DOWN(compiler->loop_shift));
960 compiler->loop_shift = save_loop_shift;
/* End of a row in a 2-D program (unless exactly one row is known at compile
   time): advance the array pointers by their strides, decrement the row
   counter and loop while it is non-zero. */
964 if (compiler->program->is_2d && compiler->program->constant_m != 1) {
965 sse_add_strides (compiler);
967 orc_x86_emit_add_imm_memoffset (compiler, 4, -1,
968 (int)ORC_STRUCT_OFFSET(OrcExecutor,params[ORC_VAR_A2]),
970 orc_x86_emit_jne (compiler, LABEL_OUTER_LOOP);
971 orc_x86_emit_label (compiler, LABEL_OUTER_LOOP_SKIP);
/* Tear-down: write accumulators back, restore MXCSR, emit emms (guards for
   the last two are not visible), emit the epilogue and resolve jump fixups. */
974 sse_save_accumulators (compiler);
978 orc_sse_restore_mxcsr (compiler);
981 orc_x86_emit_emms (compiler);
983 orc_x86_emit_epilogue (compiler);
985 orc_x86_do_fixups (compiler);
/* Emit one copy of the loop body at the compiler's current loop_shift, then
   advance the array pointers.
     offset - passed in by the caller, but overwritten in the pointer-update
              phase below; no use of the incoming value appears in the
              visible lines.
     update - number of elements to advance the source and destination
              pointers by after the body (scaled per variable below).
   NOTE(review): the lines that look up `rule`, and some guards in the
   pointer-update phase, are missing from this listing. */
989 orc_sse_emit_loop (OrcCompiler *compiler, int offset, int update)
993 OrcInstruction *insn;
994 OrcStaticOpcode *opcode;
997 for(j=0;j<compiler->n_insns;j++){
998 insn = compiler->insns + j;
999 opcode = insn->opcode;
1001 compiler->insn_index = j;
/* Invariant instructions are emitted elsewhere (orc_sse_emit_invariants). */
1003 if (insn->flags & ORC_INSN_FLAG_INVARIANT) continue;
1005 ORC_ASM_CODE(compiler,"# %d: %s\n", j, insn->opcode->name);
1007 compiler->min_temp_reg = ORC_VEC_REG_BASE;
/* insn_shift is the loop shift widened by 1 for X2 and by 2 for X4
   instructions. */
1009 compiler->insn_shift = compiler->loop_shift;
1010 if (insn->flags & ORC_INSTRUCTION_FLAG_X2) {
1011 compiler->insn_shift += 1;
1013 if (insn->flags & ORC_INSTRUCTION_FLAG_X4) {
1014 compiler->insn_shift += 2;
/* For opcodes that are not accumulator, load or store opcodes: when the
   destination and first source live in different registers, copy the source
   into the destination before invoking the rule. */
1018 if (rule && rule->emit) {
1019 if (!(insn->opcode->flags & (ORC_STATIC_OPCODE_ACCUMULATOR|ORC_STATIC_OPCODE_LOAD|ORC_STATIC_OPCODE_STORE)) &&
1020 compiler->vars[insn->dest_args[0]].alloc !=
1021 compiler->vars[insn->src_args[0]].alloc) {
1022 orc_x86_emit_mov_sse_reg_reg (compiler,
1023 compiler->vars[insn->src_args[0]].alloc,
1024 compiler->vars[insn->dest_args[0]].alloc);
1026 rule->emit (compiler, rule->emit_user, insn);
1028 ORC_COMPILER_ERROR(compiler,"No rule for: %s", opcode->name);
/* Pointer update for every named source and destination variable.  The byte
   step depends on update_type: type 1 advances by half of size * update, and
   the final branch by size * update; the value used for type 0 is on a line
   not visible here.  The step is added to the pointer register when there is
   one, otherwise to the arrays[k] slot in the executor. */
1033 for(k=0;k<ORC_N_COMPILER_VARIABLES;k++){
1034 OrcVariable *var = compiler->vars + k;
1036 if (var->name == NULL) continue;
1037 if (var->vartype == ORC_VAR_TYPE_SRC ||
1038 var->vartype == ORC_VAR_TYPE_DEST) {
1040 if (var->update_type == 0) {
1042 } else if (var->update_type == 1) {
1043 offset = (var->size * update) >> 1;
1045 offset = var->size * update;
1049 if (compiler->vars[k].ptr_register) {
1050 orc_x86_emit_add_imm_reg (compiler, compiler->is_64bit ? 8 : 4,
1052 compiler->vars[k].ptr_register, FALSE);
1054 orc_x86_emit_add_imm_memoffset (compiler, compiler->is_64bit ? 8 : 4,
1056 (int)ORC_STRUCT_OFFSET(OrcExecutor, arrays[k]),
1057 compiler->exec_reg);
/* Emit, once, every instruction flagged ORC_INSN_FLAG_INVARIANT -- exactly
   the instructions that orc_sse_emit_loop skips.  insn_shift is derived from
   loop_shift and the X2 / X4 flags in the same way as in the loop emitter.
   The line that looks up `rule` is not visible in this listing; an
   instruction without an emit rule raises a compile error. */
1066 orc_sse_emit_invariants (OrcCompiler *compiler)
1069 OrcInstruction *insn;
1070 OrcStaticOpcode *opcode;
1073 for(j=0;j<compiler->n_insns;j++){
1074 insn = compiler->insns + j;
1075 opcode = insn->opcode;
1077 if (!(insn->flags & ORC_INSN_FLAG_INVARIANT)) continue;
1079 ORC_ASM_CODE(compiler,"# %d: %s\n", j, insn->opcode->name);
1081 compiler->insn_shift = compiler->loop_shift;
1082 if (insn->flags & ORC_INSTRUCTION_FLAG_X2) {
1083 compiler->insn_shift += 1;
1085 if (insn->flags & ORC_INSTRUCTION_FLAG_X4) {
1086 compiler->insn_shift += 2;
1090 if (rule && rule->emit) {
1091 rule->emit (compiler, rule->emit_user, insn);
1093 ORC_COMPILER_ERROR(compiler,"No rule for: %s", opcode->name);