1 |
.rdata |
2 |
.asciiz "mips3.s, Version 1.1" |
3 |
.asciiz "MIPS III/IV ISA artwork by Andy Polyakov <appro@fy.chalmers.se>" |
4 |
|
5 |
/* |
6 |
* ==================================================================== |
7 |
* Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL |
8 |
* project. |
9 |
* |
10 |
* Rights for redistribution and usage in source and binary forms are |
11 |
* granted according to the OpenSSL license. Warranty of any kind is |
12 |
* disclaimed. |
13 |
* ==================================================================== |
14 |
*/ |
15 |
|
16 |
/* |
17 |
* This is my modest contribution to the OpenSSL project (see |
18 |
* http://www.openssl.org/ for more information about it) and is |
19 |
* a drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c |
20 |
* module. For updates see http://fy.chalmers.se/~appro/hpe/. |
21 |
* |
22 |
* The module is designed to work with either of the "new" MIPS ABI(5), |
23 |
* namely N32 or N64, offered by IRIX 6.x. It's not meant to work under |
24 |
* IRIX 5.x not only because it doesn't support new ABIs but also |
25 |
* because 5.x kernels put R4x00 CPU into 32-bit mode and all those |
26 |
* 64-bit instructions (daddu, dmultu, etc.) found below gonna only |
27 |
* cause illegal instruction exception:-( |
28 |
* |
29 |
* In addition the code depends on preprocessor flags set up by MIPSpro |
30 |
* compiler driver (either as or cc) and therefore (probably?) can't be |
31 |
* compiled by the GNU assembler. GNU C driver manages fine though... |
32 |
* I mean as long as -mmips-as is specified or is the default option, |
33 |
* because then it simply invokes /usr/bin/as which in turn takes |
34 |
* perfect care of the preprocessor definitions. Another neat feature |
35 |
* offered by the MIPSpro assembler is an optimization pass. This gave |
36 |
* me the opportunity to have the code looking more regular as all those |
37 |
* architecture dependent instruction rescheduling details were left to |
38 |
* the assembler. Cool, huh? |
39 |
* |
40 |
* Performance improvement is astonishing! 'apps/openssl speed rsa dsa' |
41 |
* goes way over 3 times faster! |
42 |
* |
43 |
* <appro@fy.chalmers.se> |
44 |
*/ |
45 |
#include <asm.h> |
46 |
#include <regdef.h> |
47 |
|
/*
 * MOVNZ(cond,dst,src): copy src into dst when cond is non-zero, leave
 * dst alone otherwise.
 *
 * Below MIPS IV there is no conditional move, so the copy is parked in
 * the delay slot of a branch-likely that targets the very next
 * instruction: the slot is executed only when the branch is taken,
 * i.e. only when cond != 0.  Note that this variant leaves the
 * assembler in ".set reorder" mode.
 */
#if _MIPS_ISA<4
#define	MOVNZ(cond,dst,src)	\
	.set	noreorder;	\
	bnezl	cond,.+8;	\
	move	dst,src;	\
	.set	reorder
#else
#define	MOVNZ(cond,dst,src)	\
	movn	dst,src,cond
#endif

.text

/* AT is used as an ordinary scratch register throughout this file. */
.set	noat
.set	reorder

/* Register that the word-loop routines load with the constant -4. */
#define	MINUS4	v1
65 |
|
/*
 * bn_mul_add_words
 *
 * In:	a0 = destination words (read and updated in place)
 *	a1 = source words
 *	a2 = word count (<= 0 returns 0 immediately)
 *	a3 = 64-bit multiplier
 * Out:	v0 = carry word left over after the last element
 *
 * For every index i: (carry, a0[i]) = a0[i] + a1[i]*a3 + carry.
 * Presumably the counterpart of bn_mul_add_words() in bn_asm.c, which
 * this file replaces - confirm against the C prototype.
 *
 * Scratch: AT, v1 (MINUS4), t0-t3, ta0-ta3, HI/LO.
 */
.align	5
LEAF(bn_mul_add_words)
	.set	noreorder
	bgtzl	a2,.L_bn_mul_add_words_start
	ld	t0,0(a1)		/* slot: runs only if count > 0 */
	jr	ra
	move	v0,zero			/* slot: nothing to do, return 0 */
	.set	reorder

.L_bn_mul_add_words_start:
	li	MINUS4,-4
	and	ta0,a2,MINUS4		/* count rounded down to a multiple of 4 */
	move	v0,zero			/* carry = 0 */
	beqz	ta0,.L_bn_mul_add_words_rest

	/* Four words per pass; t0 already holds a1[0] on entry. */
.L_bn_mul_add_words_by4:
	/* word 0 */
	dmultu	t0,a3
	ld	t1,0(a0)
	ld	t2,8(a1)
	ld	t3,8(a0)
	ld	ta0,16(a1)
	ld	ta1,16(a0)
	daddu	t1,v0
	sltu	v0,t1,v0	/* The manuals describe sltu as a
				 * 32-bit compare, yet it appears to
				 * behave correctly on full 64-bit
				 * registers. */
	mflo	AT
	mfhi	t0
	daddu	t1,AT
	daddu	v0,t0
	sltu	AT,t1,AT
	sd	t1,0(a0)
	daddu	v0,AT

	/* word 1 */
	dmultu	t2,a3
	ld	ta2,24(a1)
	ld	ta3,24(a0)
	daddu	t3,v0
	sltu	v0,t3,v0
	mflo	AT
	mfhi	t2
	daddu	t3,AT
	daddu	v0,t2
	sltu	AT,t3,AT
	sd	t3,8(a0)
	daddu	v0,AT

	/* word 2; pointers advance here, later offsets are negative */
	dmultu	ta0,a3
	subu	a2,4
	PTR_ADD	a0,32
	PTR_ADD	a1,32
	daddu	ta1,v0
	sltu	v0,ta1,v0
	mflo	AT
	mfhi	ta0
	daddu	ta1,AT
	daddu	v0,ta0
	sltu	AT,ta1,AT
	sd	ta1,-16(a0)
	daddu	v0,AT

	/* word 3 */
	dmultu	ta2,a3
	and	ta0,a2,MINUS4		/* any full group of 4 left? */
	daddu	ta3,v0
	sltu	v0,ta3,v0
	mflo	AT
	mfhi	ta2
	daddu	ta3,AT
	daddu	v0,ta2
	sltu	AT,ta3,AT
	sd	ta3,-8(a0)
	daddu	v0,AT
	.set	noreorder
	bgtzl	ta0,.L_bn_mul_add_words_by4
	ld	t0,0(a1)		/* slot: preload next a1[0] */

	bnezl	a2,.L_bn_mul_add_words_rest
	ld	t0,0(a1)		/* slot: preload first leftover word */
	.set	reorder

.L_bn_mul_add_words_done:
	jr	ra

	/* One to three leftover words; t0 holds a1[0]. */
.L_bn_mul_add_words_rest:
	dmultu	t0,a3
	ld	t1,0(a0)
	subu	a2,1
	daddu	t1,v0
	sltu	v0,t1,v0
	mflo	AT
	mfhi	t0
	daddu	t1,AT
	daddu	v0,t0
	sltu	AT,t1,AT
	sd	t1,0(a0)
	daddu	v0,AT
	beqz	a2,.L_bn_mul_add_words_done

	ld	t0,8(a1)
	dmultu	t0,a3
	ld	t1,8(a0)
	subu	a2,1
	daddu	t1,v0
	sltu	v0,t1,v0
	mflo	AT
	mfhi	t0
	daddu	t1,AT
	daddu	v0,t0
	sltu	AT,t1,AT
	sd	t1,8(a0)
	daddu	v0,AT
	beqz	a2,.L_bn_mul_add_words_done

	ld	t0,16(a1)
	dmultu	t0,a3
	ld	t1,16(a0)
	daddu	t1,v0
	sltu	v0,t1,v0
	mflo	AT
	mfhi	t0
	daddu	t1,AT
	daddu	v0,t0
	sltu	AT,t1,AT
	sd	t1,16(a0)
	daddu	v0,AT
	jr	ra
END(bn_mul_add_words)
194 |
|
/*
 * bn_mul_words
 *
 * In:	a0 = destination words (written only)
 *	a1 = source words
 *	a2 = word count (<= 0 returns 0 immediately)
 *	a3 = 64-bit multiplier
 * Out:	v0 = carry word left over after the last element
 *
 * For every index i: (carry, a0[i]) = a1[i]*a3 + carry.
 * Presumably the counterpart of bn_mul_words() in bn_asm.c - confirm
 * against the C prototype.
 *
 * Scratch: AT, v1 (MINUS4), t0-t3, ta0-ta3, HI/LO.
 */
.align	5
LEAF(bn_mul_words)
	.set	noreorder
	bgtzl	a2,.L_bn_mul_words_start
	ld	t0,0(a1)		/* slot: runs only if count > 0 */
	jr	ra
	move	v0,zero			/* slot: nothing to do, return 0 */
	.set	reorder

.L_bn_mul_words_start:
	li	MINUS4,-4
	and	ta0,a2,MINUS4		/* count rounded down to a multiple of 4 */
	move	v0,zero			/* carry = 0 */
	beqz	ta0,.L_bn_mul_words_rest

	/* Four words per pass; t0 already holds a1[0] on entry. */
.L_bn_mul_words_by4:
	/* word 0 */
	dmultu	t0,a3
	ld	t2,8(a1)
	ld	ta0,16(a1)
	ld	ta2,24(a1)
	mflo	AT
	mfhi	t0
	daddu	v0,AT			/* low half + incoming carry */
	sltu	t1,v0,AT		/* t1 = overflow of that addition */
	sd	v0,0(a0)
	daddu	v0,t1,t0		/* new carry = high half + overflow */

	/* word 1; pointers advance here, later offsets are negative */
	dmultu	t2,a3
	subu	a2,4
	PTR_ADD	a0,32
	PTR_ADD	a1,32
	mflo	AT
	mfhi	t2
	daddu	v0,AT
	sltu	t3,v0,AT
	sd	v0,-24(a0)
	daddu	v0,t3,t2

	/* word 2 */
	dmultu	ta0,a3
	mflo	AT
	mfhi	ta0
	daddu	v0,AT
	sltu	ta1,v0,AT
	sd	v0,-16(a0)
	daddu	v0,ta1,ta0

	/* word 3 */
	dmultu	ta2,a3
	and	ta0,a2,MINUS4		/* any full group of 4 left? */
	mflo	AT
	mfhi	ta2
	daddu	v0,AT
	sltu	ta3,v0,AT
	sd	v0,-8(a0)
	daddu	v0,ta3,ta2
	.set	noreorder
	bgtzl	ta0,.L_bn_mul_words_by4
	ld	t0,0(a1)		/* slot: preload next a1[0] */

	bnezl	a2,.L_bn_mul_words_rest
	ld	t0,0(a1)		/* slot: preload first leftover word */
	.set	reorder

.L_bn_mul_words_done:
	jr	ra

	/* One to three leftover words; t0 holds a1[0]. */
.L_bn_mul_words_rest:
	dmultu	t0,a3
	subu	a2,1
	mflo	AT
	mfhi	t0
	daddu	v0,AT
	sltu	t1,v0,AT
	sd	v0,0(a0)
	daddu	v0,t1,t0
	beqz	a2,.L_bn_mul_words_done

	ld	t0,8(a1)
	dmultu	t0,a3
	subu	a2,1
	mflo	AT
	mfhi	t0
	daddu	v0,AT
	sltu	t1,v0,AT
	sd	v0,8(a0)
	daddu	v0,t1,t0
	beqz	a2,.L_bn_mul_words_done

	ld	t0,16(a1)
	dmultu	t0,a3
	mflo	AT
	mfhi	t0
	daddu	v0,AT
	sltu	t1,v0,AT
	sd	v0,16(a0)
	daddu	v0,t1,t0
	jr	ra
END(bn_mul_words)
293 |
|
/*
 * bn_sqr_words
 *
 * In:	a0 = destination, two words are written per source word
 *	a1 = source words
 *	a2 = word count (<= 0 returns immediately)
 *
 * For every index i: a0[2i] = low half and a0[2i+1] = high half of
 * a1[i]*a1[i].  v0 is cleared on some exit paths only; presumably the
 * C prototype returns void - confirm against bn_asm.c.
 *
 * Scratch: v1 (MINUS4), t0-t3, ta0-ta3, HI/LO.
 */
.align	5
LEAF(bn_sqr_words)
	.set	noreorder
	bgtzl	a2,.L_bn_sqr_words_start
	ld	t0,0(a1)		/* slot: runs only if count > 0 */
	jr	ra
	move	v0,zero			/* slot: nothing to do */
	.set	reorder

.L_bn_sqr_words_start:
	li	MINUS4,-4
	and	ta0,a2,MINUS4		/* count rounded down to a multiple of 4 */
	move	v0,zero
	beqz	ta0,.L_bn_sqr_words_rest

	/* Four squares per pass; t0 already holds a1[0] on entry. */
.L_bn_sqr_words_by4:
	/* word 0 */
	dmultu	t0,t0
	ld	t2,8(a1)
	ld	ta0,16(a1)
	ld	ta2,24(a1)
	mflo	t1
	mfhi	t0
	sd	t1,0(a0)
	sd	t0,8(a0)

	/* word 1; destination moves 64 bytes, source 32 bytes */
	dmultu	t2,t2
	subu	a2,4
	PTR_ADD	a0,64
	PTR_ADD	a1,32
	mflo	t3
	mfhi	t2
	sd	t3,-48(a0)
	sd	t2,-40(a0)

	/* word 2 */
	dmultu	ta0,ta0
	mflo	ta1
	mfhi	ta0
	sd	ta1,-32(a0)
	sd	ta0,-24(a0)

	/* word 3 */
	dmultu	ta2,ta2
	and	ta0,a2,MINUS4		/* any full group of 4 left? */
	mflo	ta3
	mfhi	ta2
	sd	ta3,-16(a0)
	sd	ta2,-8(a0)

	.set	noreorder
	bgtzl	ta0,.L_bn_sqr_words_by4
	ld	t0,0(a1)		/* slot: preload next a1[0] */

	bnezl	a2,.L_bn_sqr_words_rest
	ld	t0,0(a1)		/* slot: preload first leftover word */
	.set	reorder

.L_bn_sqr_words_done:
	move	v0,zero
	jr	ra

	/* One to three leftover words; t0 holds a1[0]. */
.L_bn_sqr_words_rest:
	dmultu	t0,t0
	subu	a2,1
	mflo	t1
	mfhi	t0
	sd	t1,0(a0)
	sd	t0,8(a0)
	beqz	a2,.L_bn_sqr_words_done

	ld	t0,8(a1)
	dmultu	t0,t0
	subu	a2,1
	mflo	t1
	mfhi	t0
	sd	t1,16(a0)
	sd	t0,24(a0)
	beqz	a2,.L_bn_sqr_words_done

	ld	t0,16(a1)
	dmultu	t0,t0
	mflo	t1
	mfhi	t0
	sd	t1,32(a0)
	sd	t0,40(a0)
	jr	ra
END(bn_sqr_words)
380 |
|
/*
 * bn_add_words
 *
 * In:	a0 = destination words
 *	a1 = first addend words
 *	a2 = second addend words
 *	a3 = word count (<= 0 returns 0 immediately)
 * Out:	v0 = carry out of the last word
 *
 * For every index i: (carry, a0[i]) = a1[i] + a2[i] + carry.
 * Presumably the counterpart of bn_add_words() in bn_asm.c - confirm
 * against the C prototype.
 *
 * Each step adds the two operands first (carry saved in t8/t9), then
 * adds the incoming carry (carry saved in v0); the two carry bits are
 * summed into v0.
 *
 * Scratch: AT, v1 (MINUS4), t0-t3, ta0-ta3, t8, t9.
 */
.align	5
LEAF(bn_add_words)
	.set	noreorder
	bgtzl	a3,.L_bn_add_words_start
	ld	t0,0(a1)		/* slot: runs only if count > 0 */
	jr	ra
	move	v0,zero			/* slot: nothing to do, return 0 */
	.set	reorder

.L_bn_add_words_start:
	li	MINUS4,-4
	and	AT,a3,MINUS4		/* count rounded down to a multiple of 4 */
	move	v0,zero			/* carry = 0 */
	beqz	AT,.L_bn_add_words_rest

	/* Four words per pass; t0 already holds a1[0] on entry. */
.L_bn_add_words_by4:
	ld	ta0,0(a2)
	subu	a3,4
	ld	t1,8(a1)
	and	AT,a3,MINUS4		/* loop condition, kept in AT until the branch */
	ld	t2,16(a1)
	PTR_ADD	a2,32
	ld	t3,24(a1)
	PTR_ADD	a0,32
	ld	ta1,-24(a2)
	PTR_ADD	a1,32
	ld	ta2,-16(a2)
	ld	ta3,-8(a2)

	/* word 0 */
	daddu	ta0,t0
	sltu	t8,ta0,t0
	daddu	t0,ta0,v0
	sltu	v0,t0,ta0
	sd	t0,-32(a0)
	daddu	v0,t8

	/* word 1 */
	daddu	ta1,t1
	sltu	t9,ta1,t1
	daddu	t1,ta1,v0
	sltu	v0,t1,ta1
	sd	t1,-24(a0)
	daddu	v0,t9

	/* word 2 */
	daddu	ta2,t2
	sltu	t8,ta2,t2
	daddu	t2,ta2,v0
	sltu	v0,t2,ta2
	sd	t2,-16(a0)
	daddu	v0,t8

	/* word 3 */
	daddu	ta3,t3
	sltu	t9,ta3,t3
	daddu	t3,ta3,v0
	sltu	v0,t3,ta3
	sd	t3,-8(a0)
	daddu	v0,t9

	.set	noreorder
	bgtzl	AT,.L_bn_add_words_by4
	ld	t0,0(a1)		/* slot: preload next a1[0] */

	bnezl	a3,.L_bn_add_words_rest
	ld	t0,0(a1)		/* slot: preload first leftover word */
	.set	reorder

.L_bn_add_words_done:
	jr	ra

	/* One to three leftover words; t0 holds a1[0]. */
.L_bn_add_words_rest:
	ld	ta0,0(a2)
	daddu	ta0,t0
	subu	a3,1
	sltu	t8,ta0,t0
	daddu	t0,ta0,v0
	sltu	v0,t0,ta0
	sd	t0,0(a0)
	daddu	v0,t8
	beqz	a3,.L_bn_add_words_done

	ld	t1,8(a1)
	ld	ta1,8(a2)
	daddu	ta1,t1
	subu	a3,1
	sltu	t9,ta1,t1
	daddu	t1,ta1,v0
	sltu	v0,t1,ta1
	sd	t1,8(a0)
	daddu	v0,t9
	beqz	a3,.L_bn_add_words_done

	ld	t2,16(a1)
	ld	ta2,16(a2)
	daddu	ta2,t2
	sltu	t8,ta2,t2
	daddu	t2,ta2,v0
	sltu	v0,t2,ta2
	sd	t2,16(a0)
	daddu	v0,t8
	jr	ra
END(bn_add_words)
480 |
|
/*
 * bn_sub_words
 *
 * In:	a0 = destination words
 *	a1 = minuend words
 *	a2 = subtrahend words
 *	a3 = word count (<= 0 returns 0 immediately)
 * Out:	v0 = borrow out of the last word
 *
 * For every index i: a0[i] = a1[i] - a2[i] - borrow.
 * Presumably the counterpart of bn_sub_words() in bn_asm.c - confirm
 * against the C prototype.
 *
 * Borrow rule used by every step: with d = a1[i] - a2[i], the new
 * borrow is (a1[i] < a2[i]) when d != 0 and the old borrow when d == 0;
 * MOVNZ implements exactly that conditional update.
 *
 * Scratch: AT, v1 (MINUS4), t0-t3, ta0-ta3, t8, t9.
 */
.align	5
LEAF(bn_sub_words)
	.set	noreorder
	bgtzl	a3,.L_bn_sub_words_start
	ld	t0,0(a1)		/* slot: runs only if count > 0 */
	jr	ra
	move	v0,zero			/* slot: nothing to do, return 0 */
	.set	reorder

.L_bn_sub_words_start:
	li	MINUS4,-4
	and	AT,a3,MINUS4		/* count rounded down to a multiple of 4 */
	move	v0,zero			/* borrow = 0 */
	beqz	AT,.L_bn_sub_words_rest

	/* Four words per pass; t0 already holds a1[0] on entry. */
.L_bn_sub_words_by4:
	ld	ta0,0(a2)
	subu	a3,4
	ld	t1,8(a1)
	and	AT,a3,MINUS4		/* loop condition, kept in AT until the branch */
	ld	t2,16(a1)
	PTR_ADD	a2,32
	ld	t3,24(a1)
	PTR_ADD	a0,32
	ld	ta1,-24(a2)
	PTR_ADD	a1,32
	ld	ta2,-16(a2)
	ld	ta3,-8(a2)

	/* word 0 */
	sltu	t8,t0,ta0
	dsubu	t0,ta0
	dsubu	ta0,t0,v0
	sd	ta0,-32(a0)
	MOVNZ	(t0,v0,t8)

	/* word 1 */
	sltu	t9,t1,ta1
	dsubu	t1,ta1
	dsubu	ta1,t1,v0
	sd	ta1,-24(a0)
	MOVNZ	(t1,v0,t9)

	/* word 2 */
	sltu	t8,t2,ta2
	dsubu	t2,ta2
	dsubu	ta2,t2,v0
	sd	ta2,-16(a0)
	MOVNZ	(t2,v0,t8)

	/* word 3 */
	sltu	t9,t3,ta3
	dsubu	t3,ta3
	dsubu	ta3,t3,v0
	sd	ta3,-8(a0)
	MOVNZ	(t3,v0,t9)

	.set	noreorder
	bgtzl	AT,.L_bn_sub_words_by4
	ld	t0,0(a1)		/* slot: preload next a1[0] */

	bnezl	a3,.L_bn_sub_words_rest
	ld	t0,0(a1)		/* slot: preload first leftover word */
	.set	reorder

.L_bn_sub_words_done:
	jr	ra

	/* One to three leftover words; t0 holds a1[0]. */
.L_bn_sub_words_rest:
	ld	ta0,0(a2)
	subu	a3,1
	sltu	t8,t0,ta0
	dsubu	t0,ta0
	dsubu	ta0,t0,v0
	MOVNZ	(t0,v0,t8)
	sd	ta0,0(a0)
	beqz	a3,.L_bn_sub_words_done

	ld	t1,8(a1)
	subu	a3,1
	ld	ta1,8(a2)
	sltu	t9,t1,ta1
	dsubu	t1,ta1
	dsubu	ta1,t1,v0
	MOVNZ	(t1,v0,t9)
	sd	ta1,8(a0)
	beqz	a3,.L_bn_sub_words_done

	ld	t2,16(a1)
	ld	ta2,16(a2)
	sltu	t8,t2,ta2
	dsubu	t2,ta2
	dsubu	ta2,t2,v0
	MOVNZ	(t2,v0,t8)
	sd	ta2,16(a0)
	jr	ra
END(bn_sub_words)

/* Last user of the -4 constant register alias. */
#undef	MINUS4
576 |
|
/*
 * bn_div_3_words
 *
 * In:	a0 = pointer p; the words at p[0], p[-1] and p[-2] are read
 *	a1 = multiplier applied to the estimated quotient
 *	a2 = divisor handed to bn_div_words
 * Out:	v0 = quotient estimate, or -1 (all ones) when p[0] == a2
 *
 * Presumably the quotient-digit estimator used by the big-number
 * division code, with a1/a2 being the two top divisor words - confirm
 * against the C caller.
 *
 * No stack frame is used: bn_div_words is known not to touch a3, ta2
 * or ta3 and to preserve a2, so the pointer, the second argument and
 * the return address are simply parked in those registers across the
 * call.
 */
.align	5
LEAF(bn_div_3_words)
	.set	reorder
	move	a3,a0			/* keep p across the call */
	ld	a0,(a3)			/* high dividend word = p[0] */
	move	ta2,a1			/* keep the multiplier across the call */
	ld	a1,-8(a3)		/* low dividend word = p[-1] */
	bne	a0,a2,.L_bn_div_3_words_divide
	li	v0,-1			/* p[0] == divisor: return all ones */
	jr	ra
.L_bn_div_3_words_divide:
	move	ta3,ra			/* keep the return address */
	bal	bn_div_words		/* v0 = quotient, v1 = remainder */
	move	ra,ta3
	dmultu	ta2,v0			/* (t1:t0) = multiplier * quotient */
	ld	t2,-16(a3)		/* t2 = p[-2] */
	move	ta0,zero
	mfhi	t1
	mflo	t0
	sltu	t8,t1,v1		/* t8 != 0 ends the correction loop */
.L_bn_div_3_words_adjust:
	bnez	t8,.L_bn_div_3_words_done
	sgeu	AT,t2,t0
	seq	t9,t1,v1
	and	AT,t9			/* AT = (t1 == v1) && (t2 >= t0) */
	sltu	t3,t0,ta2		/* borrow of the low-word subtraction */
	daddu	v1,a2			/* remainder += divisor */
	dsubu	t1,t3
	dsubu	t0,ta2			/* (t1:t0) -= multiplier */
	sltu	t8,t1,v1
	sltu	ta0,v1,a2		/* remainder addition wrapped around */
	or	t8,ta0
	.set	noreorder
	beqzl	AT,.L_bn_div_3_words_adjust
	dsubu	v0,1			/* slot: quotient-- only when looping */
	.set	reorder
.L_bn_div_3_words_done:
	jr	ra
END(bn_div_3_words)
621 |
|
/*
 * bn_div_words
 *
 * In:	a0 = high word of the dividend
 *	a1 = low word of the dividend
 *	a2 = divisor
 * Out:	v0 = 64-bit quotient, or -1 when the divisor is zero
 *	v1 = remainder (for callers that want it)
 *
 * a2 is shifted left for the computation and shifted back before
 * returning; a0 and a1 are destroyed.  'break 6' is raised when
 * normalising the dividend would shift significant bits out of a0.
 *
 * The quotient is produced as two 32-bit digits.  Each digit is
 * estimated by dividing by the top 32 bits of the normalised divisor
 * (or taken as 0xffffffff when the top halves are equal) and then
 * decremented until divisor*digit no longer exceeds the partial
 * dividend.
 *
 * Scratch: AT, t0-t3, t8, t9, ta0, ta1, HI/LO.
 */
#define	QT	ta0	/* current quotient digit */
#define	HH	ta1	/* top 32 bits of the partial dividend */
#define	DH	v1	/* top 32 bits of the normalised divisor */

.align	5
LEAF(bn_div_words)
	.set	noreorder
	bnezl	a2,.L_bn_div_words_nonzero
	move	v1,zero			/* slot: runs only if divisor != 0 */
	jr	ra
	li	v0,-1		/* I'd rather signal div-by-zero
				 * which can be done with 'break 7' */

.L_bn_div_words_nonzero:
	bltz	a2,.L_bn_div_words_normalized
	move	t9,v1			/* slot: shift count t9 = 0 */
	dsll	a2,1			/* shift the divisor up ... */
	bgtz	a2,.-4			/* ... until its top bit is set */
	addu	t9,1			/* slot: one more bit shifted */

	.set	reorder
	negu	t1,t9
	li	t2,-1
	dsll	t2,t1			/* mask of the top t9 bits */
	and	t2,a0			/* dividend bits that would be lost */
	dsrl	AT,a1,t1		/* bits moving from a1 into a0 */
	.set	noreorder
	bnezl	t2,.+8
	break	6		/* signal overflow */
	.set	reorder
	dsll	a0,t9
	dsll	a1,t9
	or	a0,AT

.L_bn_div_words_normalized:
	dsrl	DH,a2,32
	sgeu	AT,a0,a2
	.set	noreorder
	bnezl	AT,.+8
	dsubu	a0,a2			/* slot: a0 -= a2 only when a0 >= a2 */
	.set	reorder

	/* ---- upper quotient digit ---- */
	li	QT,-1
	dsrl	HH,a0,32
	dsrl	QT,32	/* q=0xffffffff */
	beq	DH,HH,.L_bn_div_words_q1_ready
	ddivu	zero,a0,DH
	mflo	QT
.L_bn_div_words_q1_ready:
	dmultu	a2,QT
	dsll	t3,a0,32
	dsrl	AT,a1,32
	or	t3,AT			/* t3 = low word of the partial dividend */
	mflo	t0
	mfhi	t1			/* (t1:t0) = divisor * digit */
.L_bn_div_words_q1_fix:
	/* loop while (t1:t0) > (HH:t3) */
	sltu	t2,t3,t0
	seq	t8,HH,t1
	sltu	AT,HH,t1
	and	t2,t8
	sltu	v0,t0,a2		/* borrow of the low-word subtraction */
	or	AT,t2
	.set	noreorder
	beqz	AT,.L_bn_div_words_q1_done
	dsubu	t1,v0			/* slot: runs on exit too; t1 is dead then */
	dsubu	t0,a2
	b	.L_bn_div_words_q1_fix
	dsubu	QT,1			/* slot: digit-- */
	.set	reorder
.L_bn_div_words_q1_done:

	dsll	a1,32
	dsubu	a0,t3,t0		/* partial remainder */
	dsll	v0,QT,32		/* digit becomes the upper result half */

	/* ---- lower quotient digit ---- */
	li	QT,-1
	dsrl	HH,a0,32
	dsrl	QT,32	/* q=0xffffffff */
	beq	DH,HH,.L_bn_div_words_q0_ready
	ddivu	zero,a0,DH
	mflo	QT
.L_bn_div_words_q0_ready:
	/* DH is finished with; v1 serves as plain scratch from here on */
	dmultu	a2,QT
	dsll	t3,a0,32
	dsrl	AT,a1,32
	or	t3,AT
	mflo	t0
	mfhi	t1
.L_bn_div_words_q0_fix:
	/* loop while (t1:t0) > (HH:t3) */
	sltu	t2,t3,t0
	seq	t8,HH,t1
	sltu	AT,HH,t1
	and	t2,t8
	sltu	v1,t0,a2		/* borrow of the low-word subtraction */
	or	AT,t2
	.set	noreorder
	beqz	AT,.L_bn_div_words_q0_done
	dsubu	t1,v1			/* slot: runs on exit too; t1 is dead then */
	dsubu	t0,a2
	b	.L_bn_div_words_q0_fix
	dsubu	QT,1			/* slot: digit-- */
	.set	reorder
.L_bn_div_words_q0_done:

	dsubu	a0,t3,t0
	or	v0,QT			/* merge the lower digit */
	dsrl	v1,a0,t9	/* v1 contains remainder if anybody wants it */
	dsrl	a2,t9		/* restore a2 */
	jr	ra
#undef	DH
#undef	HH
#undef	QT
END(bn_div_words)
734 |
|
/*
 * Register aliases for the comba multiplication code.
 */

/* Words 0..3 of both operands sit in temporaries. */
#define	a_0	t0
#define	a_1	t1
#define	a_2	t2
#define	a_3	t3
#define	b_0	ta0
#define	b_1	ta1
#define	b_2	ta2
#define	b_3	ta3

/* Words 4..6 use saved registers; word 7 reuses the argument register. */
#define	a_4	s0
#define	b_4	s1
#define	a_5	s2
#define	b_5	s3
#define	a_6	s4
#define	b_6	s5
#define	a_7	a1	/* once we load a[7] we don't need a anymore */
#define	b_7	a2	/* once we load b[7] we don't need b anymore */

/* Low and high half of the product currently being accumulated. */
#define	t_1	t8
#define	t_2	t9

/* Three-word column accumulator; the roles rotate from column to column. */
#define	c_1	v0
#define	c_2	v1
#define	c_3	a3

/* Stack space for the six saved registers s0..s5, 8 bytes each. */
#define	FRAME_SIZE	48
761 |
|
/*
 * bn_mul_comba8
 *
 * In:	a0 = result, 16 words are written (r[0]..r[15])
 *	a1 = first operand, 8 words (a[0]..a[7])
 *	a2 = second operand, 8 words (b[0]..b[7])
 *
 * Column-wise (comba) 8x8 word multiplication: for each result word k
 * every product a[i]*b[j] with i+j == k is added into a three-word
 * accumulator whose registers c_1/c_2/c_3 rotate roles from column to
 * column.  Presumably the counterpart of bn_mul_comba8() in bn_asm.c -
 * confirm against the C prototype.
 *
 * s0..s5 are saved in a FRAME_SIZE-byte stack frame and restored before
 * returning.  a1, a2 and a3 are overwritten (a[7], b[7] and c_3 live
 * there).  Scratch: AT, v0, v1, t0-t3, ta0-ta3, t8, t9, HI/LO.
 */

/*
 * COMBA_MAC_NEW(x,y,lo,mid,hi): (mid:lo) += x*y, and hi is SET to the
 * carry out of mid.  Used for the product that opens a column, when hi
 * still holds a word that has already been stored.
 */
#define	COMBA_MAC_NEW(x,y,lo,mid,hi)	\
	dmultu	x,y;		\
	mflo	t_1;		\
	mfhi	t_2;		\
	daddu	lo,t_1;		\
	sltu	AT,lo,t_1;	\
	daddu	t_2,AT;		\
	daddu	mid,t_2;	\
	sltu	hi,mid,t_2

/*
 * COMBA_MAC(x,y,lo,mid,hi): (hi:mid:lo) += x*y, i.e. the carry out of
 * mid is ADDED to hi.
 */
#define	COMBA_MAC(x,y,lo,mid,hi)	\
	dmultu	x,y;		\
	mflo	t_1;		\
	mfhi	t_2;		\
	daddu	lo,t_1;		\
	sltu	AT,lo,t_1;	\
	daddu	t_2,AT;		\
	daddu	mid,t_2;	\
	sltu	AT,mid,t_2;	\
	daddu	hi,AT

.align	5
LEAF(bn_mul_comba8)
	.set	noreorder
	PTR_SUB	sp,FRAME_SIZE
	.frame	sp,FRAME_SIZE,ra	/* must match the adjustment above */
	.set	reorder
	ld	a_0,0(a1)	/* If compiled with -mips3 option on
				 * R5000 box assembler barks on this
				 * line with "shouldn't have mult/div
				 * as last instruction in bb (R10K
				 * bug)" warning. If anybody out there
				 * has a clue about how to circumvent
				 * this do send me a note.
				 *		<appro@fy.chalmers.se>
				 */
	ld	b_0,0(a2)
	ld	a_1,8(a1)
	ld	a_2,16(a1)
	ld	a_3,24(a1)
	ld	b_1,8(a2)
	ld	b_2,16(a2)
	ld	b_3,24(a2)

	/* ---- r[0]; s0..s5 are saved while the multiplier is busy ---- */
	dmultu	a_0,b_0		/* mul_add_c(a[0],b[0],c1,c2,c3); */
	sd	s0,0(sp)
	sd	s1,8(sp)
	sd	s2,16(sp)
	sd	s3,24(sp)
	sd	s4,32(sp)
	sd	s5,40(sp)
	mflo	c_1
	mfhi	c_2

	/* ---- r[1]; the remaining operand words are loaded here ---- */
	dmultu	a_0,b_1		/* mul_add_c(a[0],b[1],c2,c3,c1); */
	ld	a_4,32(a1)
	ld	a_5,40(a1)
	ld	a_6,48(a1)
	ld	a_7,56(a1)	/* overwrites the a pointer - keep last */
	ld	b_4,32(a2)
	ld	b_5,40(a2)
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	c_3,t_2,AT	/* first write of c_3 */
	dmultu	a_1,b_0		/* mul_add_c(a[1],b[0],c2,c3,c1); */
	ld	b_6,48(a2)
	ld	b_7,56(a2)	/* overwrites the b pointer - keep last */
	sd	c_1,0(a0)	/* r[0]=c1; */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	c_1,c_3,t_2
	sd	c_2,8(a0)	/* r[1]=c2; */

	/* ---- r[2]; the first product records no carry out of c_1 ---- */
	dmultu	a_2,b_0		/* mul_add_c(a[2],b[0],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	COMBA_MAC_NEW(a_1,b_1,c_3,c_1,c_2)	/* mul_add_c(a[1],b[1],c3,c1,c2); */
	COMBA_MAC(a_0,b_2,c_3,c_1,c_2)		/* mul_add_c(a[0],b[2],c3,c1,c2); */
	sd	c_3,16(a0)	/* r[2]=c3; */

	/* ---- r[3] ---- */
	COMBA_MAC_NEW(a_0,b_3,c_1,c_2,c_3)	/* mul_add_c(a[0],b[3],c1,c2,c3); */
	COMBA_MAC(a_1,b_2,c_1,c_2,c_3)		/* mul_add_c(a[1],b[2],c1,c2,c3); */
	COMBA_MAC(a_2,b_1,c_1,c_2,c_3)		/* mul_add_c(a[2],b[1],c1,c2,c3); */
	COMBA_MAC(a_3,b_0,c_1,c_2,c_3)		/* mul_add_c(a[3],b[0],c1,c2,c3); */
	sd	c_1,24(a0)	/* r[3]=c1; */

	/* ---- r[4] ---- */
	COMBA_MAC_NEW(a_4,b_0,c_2,c_3,c_1)	/* mul_add_c(a[4],b[0],c2,c3,c1); */
	COMBA_MAC(a_3,b_1,c_2,c_3,c_1)		/* mul_add_c(a[3],b[1],c2,c3,c1); */
	COMBA_MAC(a_2,b_2,c_2,c_3,c_1)		/* mul_add_c(a[2],b[2],c2,c3,c1); */
	COMBA_MAC(a_1,b_3,c_2,c_3,c_1)		/* mul_add_c(a[1],b[3],c2,c3,c1); */
	COMBA_MAC(a_0,b_4,c_2,c_3,c_1)		/* mul_add_c(a[0],b[4],c2,c3,c1); */
	sd	c_2,32(a0)	/* r[4]=c2; */

	/* ---- r[5] ---- */
	COMBA_MAC_NEW(a_0,b_5,c_3,c_1,c_2)	/* mul_add_c(a[0],b[5],c3,c1,c2); */
	COMBA_MAC(a_1,b_4,c_3,c_1,c_2)		/* mul_add_c(a[1],b[4],c3,c1,c2); */
	COMBA_MAC(a_2,b_3,c_3,c_1,c_2)		/* mul_add_c(a[2],b[3],c3,c1,c2); */
	COMBA_MAC(a_3,b_2,c_3,c_1,c_2)		/* mul_add_c(a[3],b[2],c3,c1,c2); */
	COMBA_MAC(a_4,b_1,c_3,c_1,c_2)		/* mul_add_c(a[4],b[1],c3,c1,c2); */
	COMBA_MAC(a_5,b_0,c_3,c_1,c_2)		/* mul_add_c(a[5],b[0],c3,c1,c2); */
	sd	c_3,40(a0)	/* r[5]=c3; */

	/* ---- r[6] ---- */
	COMBA_MAC_NEW(a_6,b_0,c_1,c_2,c_3)	/* mul_add_c(a[6],b[0],c1,c2,c3); */
	COMBA_MAC(a_5,b_1,c_1,c_2,c_3)		/* mul_add_c(a[5],b[1],c1,c2,c3); */
	COMBA_MAC(a_4,b_2,c_1,c_2,c_3)		/* mul_add_c(a[4],b[2],c1,c2,c3); */
	COMBA_MAC(a_3,b_3,c_1,c_2,c_3)		/* mul_add_c(a[3],b[3],c1,c2,c3); */
	COMBA_MAC(a_2,b_4,c_1,c_2,c_3)		/* mul_add_c(a[2],b[4],c1,c2,c3); */
	COMBA_MAC(a_1,b_5,c_1,c_2,c_3)		/* mul_add_c(a[1],b[5],c1,c2,c3); */
	COMBA_MAC(a_0,b_6,c_1,c_2,c_3)		/* mul_add_c(a[0],b[6],c1,c2,c3); */
	sd	c_1,48(a0)	/* r[6]=c1; */

	/* ---- r[7] ---- */
	COMBA_MAC_NEW(a_0,b_7,c_2,c_3,c_1)	/* mul_add_c(a[0],b[7],c2,c3,c1); */
	COMBA_MAC(a_1,b_6,c_2,c_3,c_1)		/* mul_add_c(a[1],b[6],c2,c3,c1); */
	COMBA_MAC(a_2,b_5,c_2,c_3,c_1)		/* mul_add_c(a[2],b[5],c2,c3,c1); */
	COMBA_MAC(a_3,b_4,c_2,c_3,c_1)		/* mul_add_c(a[3],b[4],c2,c3,c1); */
	COMBA_MAC(a_4,b_3,c_2,c_3,c_1)		/* mul_add_c(a[4],b[3],c2,c3,c1); */
	COMBA_MAC(a_5,b_2,c_2,c_3,c_1)		/* mul_add_c(a[5],b[2],c2,c3,c1); */
	COMBA_MAC(a_6,b_1,c_2,c_3,c_1)		/* mul_add_c(a[6],b[1],c2,c3,c1); */
	COMBA_MAC(a_7,b_0,c_2,c_3,c_1)		/* mul_add_c(a[7],b[0],c2,c3,c1); */
	sd	c_2,56(a0)	/* r[7]=c2; */

	/* ---- r[8] ---- */
	COMBA_MAC_NEW(a_7,b_1,c_3,c_1,c_2)	/* mul_add_c(a[7],b[1],c3,c1,c2); */
	COMBA_MAC(a_6,b_2,c_3,c_1,c_2)		/* mul_add_c(a[6],b[2],c3,c1,c2); */
	COMBA_MAC(a_5,b_3,c_3,c_1,c_2)		/* mul_add_c(a[5],b[3],c3,c1,c2); */
	COMBA_MAC(a_4,b_4,c_3,c_1,c_2)		/* mul_add_c(a[4],b[4],c3,c1,c2); */
	COMBA_MAC(a_3,b_5,c_3,c_1,c_2)		/* mul_add_c(a[3],b[5],c3,c1,c2); */
	COMBA_MAC(a_2,b_6,c_3,c_1,c_2)		/* mul_add_c(a[2],b[6],c3,c1,c2); */
	COMBA_MAC(a_1,b_7,c_3,c_1,c_2)		/* mul_add_c(a[1],b[7],c3,c1,c2); */
	sd	c_3,64(a0)	/* r[8]=c3; */

	/* ---- r[9] ---- */
	COMBA_MAC_NEW(a_2,b_7,c_1,c_2,c_3)	/* mul_add_c(a[2],b[7],c1,c2,c3); */
	COMBA_MAC(a_3,b_6,c_1,c_2,c_3)		/* mul_add_c(a[3],b[6],c1,c2,c3); */
	COMBA_MAC(a_4,b_5,c_1,c_2,c_3)		/* mul_add_c(a[4],b[5],c1,c2,c3); */
	COMBA_MAC(a_5,b_4,c_1,c_2,c_3)		/* mul_add_c(a[5],b[4],c1,c2,c3); */
	COMBA_MAC(a_6,b_3,c_1,c_2,c_3)		/* mul_add_c(a[6],b[3],c1,c2,c3); */
	COMBA_MAC(a_7,b_2,c_1,c_2,c_3)		/* mul_add_c(a[7],b[2],c1,c2,c3); */
	sd	c_1,72(a0)	/* r[9]=c1; */

	/* ---- r[10] ---- */
	COMBA_MAC_NEW(a_7,b_3,c_2,c_3,c_1)	/* mul_add_c(a[7],b[3],c2,c3,c1); */
	COMBA_MAC(a_6,b_4,c_2,c_3,c_1)		/* mul_add_c(a[6],b[4],c2,c3,c1); */
	COMBA_MAC(a_5,b_5,c_2,c_3,c_1)		/* mul_add_c(a[5],b[5],c2,c3,c1); */
	COMBA_MAC(a_4,b_6,c_2,c_3,c_1)		/* mul_add_c(a[4],b[6],c2,c3,c1); */
	COMBA_MAC(a_3,b_7,c_2,c_3,c_1)		/* mul_add_c(a[3],b[7],c2,c3,c1); */
	sd	c_2,80(a0)	/* r[10]=c2; */

	/* ---- r[11] ---- */
	COMBA_MAC_NEW(a_4,b_7,c_3,c_1,c_2)	/* mul_add_c(a[4],b[7],c3,c1,c2); */
	COMBA_MAC(a_5,b_6,c_3,c_1,c_2)		/* mul_add_c(a[5],b[6],c3,c1,c2); */
	COMBA_MAC(a_6,b_5,c_3,c_1,c_2)		/* mul_add_c(a[6],b[5],c3,c1,c2); */
	COMBA_MAC(a_7,b_4,c_3,c_1,c_2)		/* mul_add_c(a[7],b[4],c3,c1,c2); */
	sd	c_3,88(a0)	/* r[11]=c3; */

	/* ---- r[12] ---- */
	COMBA_MAC_NEW(a_7,b_5,c_1,c_2,c_3)	/* mul_add_c(a[7],b[5],c1,c2,c3); */
	COMBA_MAC(a_6,b_6,c_1,c_2,c_3)		/* mul_add_c(a[6],b[6],c1,c2,c3); */
	COMBA_MAC(a_5,b_7,c_1,c_2,c_3)		/* mul_add_c(a[5],b[7],c1,c2,c3); */
	sd	c_1,96(a0)	/* r[12]=c1; */

	/* ---- r[13] ---- */
	COMBA_MAC_NEW(a_6,b_7,c_2,c_3,c_1)	/* mul_add_c(a[6],b[7],c2,c3,c1); */
	COMBA_MAC(a_7,b_6,c_2,c_3,c_1)		/* mul_add_c(a[7],b[6],c2,c3,c1); */
	sd	c_2,104(a0)	/* r[13]=c2; */

	/* ---- r[14], r[15]; s0..s5 are restored while the multiplier is busy ---- */
	dmultu	a_7,b_7		/* mul_add_c(a[7],b[7],c3,c1,c2); */
	ld	s0,0(sp)
	ld	s1,8(sp)
	ld	s2,16(sp)
	ld	s3,24(sp)
	ld	s4,32(sp)
	ld	s5,40(sp)
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sd	c_3,112(a0)	/* r[14]=c3; */
	sd	c_1,120(a0)	/* r[15]=c1; */

	PTR_ADD	sp,FRAME_SIZE

	jr	ra
END(bn_mul_comba8)

#undef	COMBA_MAC
#undef	COMBA_MAC_NEW
1389 |
|
1390 |
/*
 * bn_mul_comba4: r[0..7] = a[0..3] * b[0..3], 64-bit words.
 *
 *   a0 = r (eight result words are stored at 0..56(a0))
 *   a1 = a (four words loaded from 0..24(a1))
 *   a2 = b (four words loaded from 0..24(a2))
 *
 * The product is built column by column.  c_1/c_2/c_3 form a rotating
 * three-word accumulator; "mul_add_c(x,y,lo,mid,hi)" in the comments means
 * (hi,mid,lo) += x*y.  t_1/t_2 receive LO/HI of each dmultu and AT holds
 * the carry recovered with sltu.  The first product of a column writes the
 * new top word with sltu instead of adding to it, which is how that word
 * gets (re)initialised.
 *
 * The operand loads are kept in their original order on purpose.
 */
.align	5
LEAF(bn_mul_comba4)
	.set	reorder
	ld	a_0,0(a1)
	ld	b_0,0(a2)
	ld	a_1,8(a1)
	ld	a_2,16(a1)
	dmultu	a_0,b_0			/* mul_add_c(a[0],b[0],c1,c2,c3); */
	ld	a_3,24(a1)
	ld	b_1,8(a2)
	ld	b_2,16(a2)
	ld	b_3,24(a2)
	mflo	c_1
	mfhi	c_2
	sd	c_1,0(a0)		/* r[0]=c1; */

	/* ---- column 1: a[0]*b[1] + a[1]*b[0] ---- */
	dmultu	a_0,b_1			/* mul_add_c(a[0],b[1],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,c_2,t_1
	sltu	AT,c_2,t_1
	daddu	c_3,t_2,AT		/* c3 starts out as hi + carry */
	dmultu	a_1,b_0			/* mul_add_c(a[1],b[0],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,t_2,AT
	daddu	c_3,c_3,t_2
	sltu	c_1,c_3,t_2		/* c1 = carry out of c3 */
	sd	c_2,8(a0)		/* r[1]=c2; */

	/* ---- column 2: a[2]*b[0] + a[1]*b[1] + a[0]*b[2] ---- */
	dmultu	a_2,b_0			/* mul_add_c(a[2],b[0],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,t_2,AT
	daddu	c_1,c_1,t_2		/* no carry is captured at this step */
	dmultu	a_1,b_1			/* mul_add_c(a[1],b[1],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,t_2,AT
	daddu	c_1,c_1,t_2
	sltu	c_2,c_1,t_2		/* c2 = carry out of c1 */
	dmultu	a_0,b_2			/* mul_add_c(a[0],b[2],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,t_2,AT
	daddu	c_1,c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,c_2,AT
	sd	c_3,16(a0)		/* r[2]=c3; */

	/* ---- column 3: a[0]*b[3] + a[1]*b[2] + a[2]*b[1] + a[3]*b[0] ---- */
	dmultu	a_0,b_3			/* mul_add_c(a[0],b[3],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,t_2,AT
	daddu	c_2,c_2,t_2
	sltu	c_3,c_2,t_2		/* c3 = carry out of c2 */
	dmultu	a_1,b_2			/* mul_add_c(a[1],b[2],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,t_2,AT
	daddu	c_2,c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,c_3,AT
	dmultu	a_2,b_1			/* mul_add_c(a[2],b[1],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,t_2,AT
	daddu	c_2,c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,c_3,AT
	dmultu	a_3,b_0			/* mul_add_c(a[3],b[0],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,t_2,AT
	daddu	c_2,c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,c_3,AT
	sd	c_1,24(a0)		/* r[3]=c1; */

	/* ---- column 4: a[3]*b[1] + a[2]*b[2] + a[1]*b[3] ---- */
	dmultu	a_3,b_1			/* mul_add_c(a[3],b[1],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,t_2,AT
	daddu	c_3,c_3,t_2
	sltu	c_1,c_3,t_2		/* c1 = carry out of c3 */
	dmultu	a_2,b_2			/* mul_add_c(a[2],b[2],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,t_2,AT
	daddu	c_3,c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,c_1,AT
	dmultu	a_1,b_3			/* mul_add_c(a[1],b[3],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,t_2,AT
	daddu	c_3,c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,c_1,AT
	sd	c_2,32(a0)		/* r[4]=c2; */

	/* ---- column 5: a[2]*b[3] + a[3]*b[2] ---- */
	dmultu	a_2,b_3			/* mul_add_c(a[2],b[3],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,t_2,AT
	daddu	c_1,c_1,t_2
	sltu	c_2,c_1,t_2		/* c2 = carry out of c1 */
	dmultu	a_3,b_2			/* mul_add_c(a[3],b[2],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,t_2,AT
	daddu	c_1,c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,c_2,AT
	sd	c_3,40(a0)		/* r[5]=c3; */

	/* ---- column 6: a[3]*b[3]; the last two result words ---- */
	dmultu	a_3,b_3			/* mul_add_c(a[3],b[3],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,t_2,AT
	daddu	c_2,c_2,t_2
	sd	c_1,48(a0)		/* r[6]=c1; */
	sd	c_2,56(a0)		/* r[7]=c2; */

	jr	ra
END(bn_mul_comba4)
1544 |
|
1545 |
/*
 * Re-point a_4..a_7 at the registers named b_0..b_3 for the code that
 * follows.  bn_sqr_comba8 below loads a[4]..a[7] through these names and
 * never references a b_N register itself.
 */
#undef	a_4
#define	a_4	b_0
#undef	a_5
#define	a_5	b_1
#undef	a_6
#define	a_6	b_2
#undef	a_7
#define	a_7	b_3
1553 |
|
1554 |
/*
 * bn_sqr_comba8: r[0..15] = a[0..7]^2, 64-bit words.
 *
 *   a0 = r (sixteen result words are stored at 0..120(a0))
 *   a1 = a (eight words loaded from 0..56(a1))
 *   a2 = clobbered: used as a scratch register in the a[0]*a[1] step
 *
 * The square is built column by column.  c_1/c_2/c_3 form a rotating
 * three-word accumulator, t_1/t_2 receive LO/HI of each dmultu and AT holds
 * carries recovered with sltu.  In the comments
 *   mul_add_c (x,y,lo,mid,hi)  means (hi,mid,lo) += x*y
 *   mul_add_c2(x,y,lo,mid,hi)  means (hi,mid,lo) += 2*x*y
 * The first product of a column writes the new top word with sltu/slt
 * instead of adding to it, which is how that word gets (re)initialised.
 *
 * mul_add_c2 adds LO and HI twice rather than doubling them first, so no
 * intermediate value can wrap.  The only exception is the very first cross
 * product a[0]*a[1], which still doubles by shifting; there the sum
 * "doubled hi + carry" CAN wrap (e.g. a[0]=2^64-2, a[1]=2^63+1), so its
 * carry is now folded into c_1 explicitly.  Previously that carry was lost.
 */
.align	5
LEAF(bn_sqr_comba8)
	.set	reorder
	ld	a_0,0(a1)
	ld	a_1,8(a1)
	ld	a_2,16(a1)
	ld	a_3,24(a1)

	dmultu	a_0,a_0		/* mul_add_c(a[0],a[0],c1,c2,c3); */
	ld	a_4,32(a1)
	ld	a_5,40(a1)
	ld	a_6,48(a1)
	ld	a_7,56(a1)
	mflo	c_1
	mfhi	c_2
	sd	c_1,0(a0)	/* r[0]=c1; */

	dmultu	a_0,a_1		/* mul_add_c2(a[0],a[1],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	slt	c_1,t_2,zero	/* c1 = top bit of hi, shifted out by the doubling */
	dsll	t_2,1
	slt	a2,t_1,zero	/* a2 = top bit of lo, moves into doubled hi */
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	c_3,t_2,AT
	sltu	AT,c_3,t_2	/* t_2 may be all ones: catch the wrap of t_2+carry */
	daddu	c_1,AT
	sd	c_2,8(a0)	/* r[1]=c2; */

	dmultu	a_2,a_0		/* mul_add_c2(a[2],a[0],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	c_3,t_1
	daddu	AT,t_2
	sltu	t_1,c_3,t_1
	daddu	c_1,AT
	daddu	t_2,t_1
	sltu	c_2,c_1,AT
	daddu	c_1,t_2
	sltu	t_2,c_1,t_2
	daddu	c_2,t_2
	dmultu	a_1,a_1		/* mul_add_c(a[1],a[1],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	sd	c_3,16(a0)	/* r[2]=c3; */

	dmultu	a_0,a_3		/* mul_add_c2(a[0],a[3],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	c_1,t_1
	daddu	AT,t_2
	sltu	t_1,c_1,t_1
	daddu	c_2,AT
	daddu	t_2,t_1
	sltu	c_3,c_2,AT
	daddu	c_2,t_2
	sltu	t_2,c_2,t_2
	daddu	c_3,t_2
	dmultu	a_1,a_2		/* mul_add_c2(a[1],a[2],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	c_1,t_1
	daddu	AT,t_2
	sltu	t_1,c_1,t_1
	daddu	c_2,AT
	daddu	t_2,t_1
	sltu	AT,c_2,AT
	daddu	c_2,t_2
	daddu	c_3,AT
	sltu	t_2,c_2,t_2
	daddu	c_3,t_2
	sd	c_1,24(a0)	/* r[3]=c1; */

	dmultu	a_4,a_0		/* mul_add_c2(a[4],a[0],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	c_2,t_1
	daddu	AT,t_2
	sltu	t_1,c_2,t_1
	daddu	c_3,AT
	daddu	t_2,t_1
	sltu	c_1,c_3,AT
	daddu	c_3,t_2
	sltu	t_2,c_3,t_2
	daddu	c_1,t_2
	dmultu	a_3,a_1		/* mul_add_c2(a[3],a[1],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	c_2,t_1
	daddu	AT,t_2
	sltu	t_1,c_2,t_1
	daddu	c_3,AT
	daddu	t_2,t_1
	sltu	AT,c_3,AT
	daddu	c_3,t_2
	daddu	c_1,AT
	sltu	t_2,c_3,t_2
	daddu	c_1,t_2
	dmultu	a_2,a_2		/* mul_add_c(a[2],a[2],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	sd	c_2,32(a0)	/* r[4]=c2; */

	dmultu	a_0,a_5		/* mul_add_c2(a[0],a[5],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	c_3,t_1
	daddu	AT,t_2
	sltu	t_1,c_3,t_1
	daddu	c_1,AT
	daddu	t_2,t_1
	sltu	c_2,c_1,AT
	daddu	c_1,t_2
	sltu	t_2,c_1,t_2
	daddu	c_2,t_2
	dmultu	a_1,a_4		/* mul_add_c2(a[1],a[4],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	c_3,t_1
	daddu	AT,t_2
	sltu	t_1,c_3,t_1
	daddu	c_1,AT
	daddu	t_2,t_1
	sltu	AT,c_1,AT
	daddu	c_1,t_2
	daddu	c_2,AT
	sltu	t_2,c_1,t_2
	daddu	c_2,t_2
	dmultu	a_2,a_3		/* mul_add_c2(a[2],a[3],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	c_3,t_1
	daddu	AT,t_2
	sltu	t_1,c_3,t_1
	daddu	c_1,AT
	daddu	t_2,t_1
	sltu	AT,c_1,AT
	daddu	c_1,t_2
	daddu	c_2,AT
	sltu	t_2,c_1,t_2
	daddu	c_2,t_2
	sd	c_3,40(a0)	/* r[5]=c3; */

	dmultu	a_6,a_0		/* mul_add_c2(a[6],a[0],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	c_1,t_1
	daddu	AT,t_2
	sltu	t_1,c_1,t_1
	daddu	c_2,AT
	daddu	t_2,t_1
	sltu	c_3,c_2,AT
	daddu	c_2,t_2
	sltu	t_2,c_2,t_2
	daddu	c_3,t_2
	dmultu	a_5,a_1		/* mul_add_c2(a[5],a[1],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	c_1,t_1
	daddu	AT,t_2
	sltu	t_1,c_1,t_1
	daddu	c_2,AT
	daddu	t_2,t_1
	sltu	AT,c_2,AT
	daddu	c_2,t_2
	daddu	c_3,AT
	sltu	t_2,c_2,t_2
	daddu	c_3,t_2
	dmultu	a_4,a_2		/* mul_add_c2(a[4],a[2],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	c_1,t_1
	daddu	AT,t_2
	sltu	t_1,c_1,t_1
	daddu	c_2,AT
	daddu	t_2,t_1
	sltu	AT,c_2,AT
	daddu	c_2,t_2
	daddu	c_3,AT
	sltu	t_2,c_2,t_2
	daddu	c_3,t_2
	dmultu	a_3,a_3		/* mul_add_c(a[3],a[3],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	sd	c_1,48(a0)	/* r[6]=c1; */

	dmultu	a_0,a_7		/* mul_add_c2(a[0],a[7],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	c_2,t_1
	daddu	AT,t_2
	sltu	t_1,c_2,t_1
	daddu	c_3,AT
	daddu	t_2,t_1
	sltu	c_1,c_3,AT
	daddu	c_3,t_2
	sltu	t_2,c_3,t_2
	daddu	c_1,t_2
	dmultu	a_1,a_6		/* mul_add_c2(a[1],a[6],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	c_2,t_1
	daddu	AT,t_2
	sltu	t_1,c_2,t_1
	daddu	c_3,AT
	daddu	t_2,t_1
	sltu	AT,c_3,AT
	daddu	c_3,t_2
	daddu	c_1,AT
	sltu	t_2,c_3,t_2
	daddu	c_1,t_2
	dmultu	a_2,a_5		/* mul_add_c2(a[2],a[5],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	c_2,t_1
	daddu	AT,t_2
	sltu	t_1,c_2,t_1
	daddu	c_3,AT
	daddu	t_2,t_1
	sltu	AT,c_3,AT
	daddu	c_3,t_2
	daddu	c_1,AT
	sltu	t_2,c_3,t_2
	daddu	c_1,t_2
	dmultu	a_3,a_4		/* mul_add_c2(a[3],a[4],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	c_2,t_1
	daddu	AT,t_2
	sltu	t_1,c_2,t_1
	daddu	c_3,AT
	daddu	t_2,t_1
	sltu	AT,c_3,AT
	daddu	c_3,t_2
	daddu	c_1,AT
	sltu	t_2,c_3,t_2
	daddu	c_1,t_2
	sd	c_2,56(a0)	/* r[7]=c2; */

	dmultu	a_7,a_1		/* mul_add_c2(a[7],a[1],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	c_3,t_1
	daddu	AT,t_2
	sltu	t_1,c_3,t_1
	daddu	c_1,AT
	daddu	t_2,t_1
	sltu	c_2,c_1,AT
	daddu	c_1,t_2
	sltu	t_2,c_1,t_2
	daddu	c_2,t_2
	dmultu	a_6,a_2		/* mul_add_c2(a[6],a[2],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	c_3,t_1
	daddu	AT,t_2
	sltu	t_1,c_3,t_1
	daddu	c_1,AT
	daddu	t_2,t_1
	sltu	AT,c_1,AT
	daddu	c_1,t_2
	daddu	c_2,AT
	sltu	t_2,c_1,t_2
	daddu	c_2,t_2
	dmultu	a_5,a_3		/* mul_add_c2(a[5],a[3],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	c_3,t_1
	daddu	AT,t_2
	sltu	t_1,c_3,t_1
	daddu	c_1,AT
	daddu	t_2,t_1
	sltu	AT,c_1,AT
	daddu	c_1,t_2
	daddu	c_2,AT
	sltu	t_2,c_1,t_2
	daddu	c_2,t_2
	dmultu	a_4,a_4		/* mul_add_c(a[4],a[4],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	sd	c_3,64(a0)	/* r[8]=c3; */

	dmultu	a_2,a_7		/* mul_add_c2(a[2],a[7],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	c_1,t_1
	daddu	AT,t_2
	sltu	t_1,c_1,t_1
	daddu	c_2,AT
	daddu	t_2,t_1
	sltu	c_3,c_2,AT
	daddu	c_2,t_2
	sltu	t_2,c_2,t_2
	daddu	c_3,t_2
	dmultu	a_3,a_6		/* mul_add_c2(a[3],a[6],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	c_1,t_1
	daddu	AT,t_2
	sltu	t_1,c_1,t_1
	daddu	c_2,AT
	daddu	t_2,t_1
	sltu	AT,c_2,AT
	daddu	c_2,t_2
	daddu	c_3,AT
	sltu	t_2,c_2,t_2
	daddu	c_3,t_2
	dmultu	a_4,a_5		/* mul_add_c2(a[4],a[5],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	c_1,t_1
	daddu	AT,t_2
	sltu	t_1,c_1,t_1
	daddu	c_2,AT
	daddu	t_2,t_1
	sltu	AT,c_2,AT
	daddu	c_2,t_2
	daddu	c_3,AT
	sltu	t_2,c_2,t_2
	daddu	c_3,t_2
	sd	c_1,72(a0)	/* r[9]=c1; */

	dmultu	a_7,a_3		/* mul_add_c2(a[7],a[3],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	c_2,t_1
	daddu	AT,t_2
	sltu	t_1,c_2,t_1
	daddu	c_3,AT
	daddu	t_2,t_1
	sltu	c_1,c_3,AT
	daddu	c_3,t_2
	sltu	t_2,c_3,t_2
	daddu	c_1,t_2
	dmultu	a_6,a_4		/* mul_add_c2(a[6],a[4],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	c_2,t_1
	daddu	AT,t_2
	sltu	t_1,c_2,t_1
	daddu	c_3,AT
	daddu	t_2,t_1
	sltu	AT,c_3,AT
	daddu	c_3,t_2
	daddu	c_1,AT
	sltu	t_2,c_3,t_2
	daddu	c_1,t_2
	dmultu	a_5,a_5		/* mul_add_c(a[5],a[5],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	sd	c_2,80(a0)	/* r[10]=c2; */

	dmultu	a_4,a_7		/* mul_add_c2(a[4],a[7],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	c_3,t_1
	daddu	AT,t_2
	sltu	t_1,c_3,t_1
	daddu	c_1,AT
	daddu	t_2,t_1
	sltu	c_2,c_1,AT
	daddu	c_1,t_2
	sltu	t_2,c_1,t_2
	daddu	c_2,t_2
	dmultu	a_5,a_6		/* mul_add_c2(a[5],a[6],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	c_3,t_1
	daddu	AT,t_2
	sltu	t_1,c_3,t_1
	daddu	c_1,AT
	daddu	t_2,t_1
	sltu	AT,c_1,AT
	daddu	c_1,t_2
	daddu	c_2,AT
	sltu	t_2,c_1,t_2
	daddu	c_2,t_2
	sd	c_3,88(a0)	/* r[11]=c3; */

	dmultu	a_7,a_5		/* mul_add_c2(a[7],a[5],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	c_1,t_1
	daddu	AT,t_2
	sltu	t_1,c_1,t_1
	daddu	c_2,AT
	daddu	t_2,t_1
	sltu	c_3,c_2,AT
	daddu	c_2,t_2
	sltu	t_2,c_2,t_2
	daddu	c_3,t_2
	dmultu	a_6,a_6		/* mul_add_c(a[6],a[6],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	sd	c_1,96(a0)	/* r[12]=c1; */

	dmultu	a_6,a_7		/* mul_add_c2(a[6],a[7],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	c_2,t_1
	daddu	AT,t_2
	sltu	t_1,c_2,t_1
	daddu	c_3,AT
	daddu	t_2,t_1
	sltu	c_1,c_3,AT
	daddu	c_3,t_2
	sltu	t_2,c_3,t_2
	daddu	c_1,t_2
	sd	c_2,104(a0)	/* r[13]=c2; */

	dmultu	a_7,a_7		/* mul_add_c(a[7],a[7],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sd	c_3,112(a0)	/* r[14]=c3; */
	sd	c_1,120(a0)	/* r[15]=c1; */

	jr	ra
END(bn_sqr_comba8)
2067 |
|
2068 |
/*
 * bn_sqr_comba4: r[0..7] = a[0..3]^2, 64-bit words.
 *
 *   a0 = r (eight result words are stored at 0..56(a0))
 *   a1 = a (four words loaded from 0..24(a1))
 *   a2 = clobbered: used as a scratch register in the a[0]*a[1] step
 *
 * Same scheme as the 8-word squaring: c_1/c_2/c_3 rotate as a three-word
 * column accumulator, t_1/t_2 receive LO/HI of each dmultu, AT holds
 * carries recovered with sltu.  In the comments
 *   mul_add_c (x,y,lo,mid,hi)  means (hi,mid,lo) += x*y
 *   mul_add_c2(x,y,lo,mid,hi)  means (hi,mid,lo) += 2*x*y
 *
 * The first cross product a[0]*a[1] doubles by shifting, and the sum
 * "doubled hi + carry" can wrap (e.g. a[0]=2^64-2, a[1]=2^63+1); that
 * carry is now folded into c_1 explicitly.  Previously it was lost.
 */
.align	5
LEAF(bn_sqr_comba4)
	.set	reorder
	ld	a_0,0(a1)
	ld	a_1,8(a1)
	dmultu	a_0,a_0		/* mul_add_c(a[0],a[0],c1,c2,c3); */
	ld	a_2,16(a1)
	ld	a_3,24(a1)
	mflo	c_1
	mfhi	c_2
	sd	c_1,0(a0)	/* r[0]=c1; */

	dmultu	a_0,a_1		/* mul_add_c2(a[0],a[1],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	slt	c_1,t_2,zero	/* c1 = top bit of hi, shifted out by the doubling */
	dsll	t_2,1
	slt	a2,t_1,zero	/* a2 = top bit of lo, moves into doubled hi */
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	c_3,t_2,AT
	sltu	AT,c_3,t_2	/* t_2 may be all ones: catch the wrap of t_2+carry */
	daddu	c_1,AT
	sd	c_2,8(a0)	/* r[1]=c2; */

	dmultu	a_2,a_0		/* mul_add_c2(a[2],a[0],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	c_3,t_1
	daddu	AT,t_2
	sltu	t_1,c_3,t_1
	daddu	c_1,AT
	daddu	t_2,t_1
	sltu	c_2,c_1,AT
	daddu	c_1,t_2
	sltu	t_2,c_1,t_2
	daddu	c_2,t_2
	dmultu	a_1,a_1		/* mul_add_c(a[1],a[1],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	sd	c_3,16(a0)	/* r[2]=c3; */

	dmultu	a_0,a_3		/* mul_add_c2(a[0],a[3],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	c_1,t_1
	daddu	AT,t_2
	sltu	t_1,c_1,t_1
	daddu	c_2,AT
	daddu	t_2,t_1
	sltu	c_3,c_2,AT
	daddu	c_2,t_2
	sltu	t_2,c_2,t_2
	daddu	c_3,t_2
	dmultu	a_1,a_2		/* mul_add_c2(a[1],a[2],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	c_1,t_1
	daddu	AT,t_2
	sltu	t_1,c_1,t_1
	daddu	c_2,AT
	daddu	t_2,t_1
	sltu	AT,c_2,AT
	daddu	c_2,t_2
	daddu	c_3,AT
	sltu	t_2,c_2,t_2
	daddu	c_3,t_2
	sd	c_1,24(a0)	/* r[3]=c1; */

	dmultu	a_3,a_1		/* mul_add_c2(a[3],a[1],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	c_2,t_1
	daddu	AT,t_2
	sltu	t_1,c_2,t_1
	daddu	c_3,AT
	daddu	t_2,t_1
	sltu	c_1,c_3,AT
	daddu	c_3,t_2
	sltu	t_2,c_3,t_2
	daddu	c_1,t_2
	dmultu	a_2,a_2		/* mul_add_c(a[2],a[2],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	sd	c_2,32(a0)	/* r[4]=c2; */

	dmultu	a_2,a_3		/* mul_add_c2(a[2],a[3],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	c_3,t_1
	daddu	AT,t_2
	sltu	t_1,c_3,t_1
	daddu	c_1,AT
	daddu	t_2,t_1
	sltu	c_2,c_1,AT
	daddu	c_1,t_2
	sltu	t_2,c_1,t_2
	daddu	c_2,t_2
	sd	c_3,40(a0)	/* r[5]=c3; */

	dmultu	a_3,a_3		/* mul_add_c(a[3],a[3],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sd	c_1,48(a0)	/* r[6]=c1; */
	sd	c_2,56(a0)	/* r[7]=c2; */

	jr	ra
END(bn_sqr_comba4)