Merge lp:~will-newton/cortex-strings/cortex-strings into lp:cortex-strings
- cortex-strings
- Merge into trunk
Proposed by: Will Newton
Status: | Merged |
---|---|
Approved by: | Will Newton |
Approved revision: | 100 |
Merged at revision: | 100 |
Proposed branch: | lp:~will-newton/cortex-strings/cortex-strings |
Merge into: | lp:cortex-strings |
Diff against target: | 1062 lines (+604/-399), 4 files modified: Makefile.am (+11/-11), configure.ac (+8/-0), src/linaro-a9/memcpy-hybrid.S (+0/-152), src/linaro-a9/memcpy.S (+585/-236) |
To merge this branch: | bzr merge lp:~will-newton/cortex-strings/cortex-strings |
Related bugs: |
Reviewer | Review Type | Date Requested | Status |
---|---|---|---|
Will Newton (community) | Approve | ||
Review via email: mp+155445@code.launchpad.net |
Commit message
Description of the change
Integrate new NEON/VFP/ARM memcpy implementation.
To post a comment you must log in.
Revision history for this message
Will Newton (will-newton) : | # |
review:
Approve
Preview Diff
[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1 | === modified file 'Makefile.am' |
2 | --- Makefile.am 2013-01-07 15:57:20 +0000 |
3 | +++ Makefile.am 2013-03-26 10:23:26 +0000 |
4 | @@ -171,15 +171,15 @@ |
5 | |
6 | if WITH_NEON |
7 | # Pull in the NEON specific files |
8 | -neon_sources = \ |
9 | - src/linaro-a9/memcpy-hybrid.S |
10 | neon_bionic_sources = \ |
11 | reference/bionic/memcpy.S |
12 | -neon_cppflags = -mfpu=neon |
13 | -neon_dirs = neon |
14 | -else |
15 | -alternate_sources = \ |
16 | - src/linaro-a9/memcpy.S |
17 | +fpu_flags = -mfpu=neon |
18 | +else |
19 | +if WITH_VFP |
20 | +fpu_flags = -mfpu=vfp |
21 | +else |
22 | +fpu_flags = -msoft-float |
23 | +endif |
24 | endif |
25 | |
26 | # Benchmarks and example programs |
27 | @@ -200,13 +200,12 @@ |
28 | |
29 | # Main library |
30 | libcortex_strings_la_SOURCES = \ |
31 | - $(neon_sources) \ |
32 | - $(alternate_sources) \ |
33 | src/thumb-2/strcpy.c \ |
34 | src/linaro-a9/memchr.S \ |
35 | src/linaro-a9/strchr.S \ |
36 | src/linaro-a9/strlen.S \ |
37 | - src/linaro-a9/memset.S |
38 | + src/linaro-a9/memset.S \ |
39 | + src/linaro-a9/memcpy.S |
40 | |
41 | # Libraries containing the different reference versions |
42 | libbionic_a_SOURCES = \ |
43 | @@ -259,7 +258,8 @@ |
44 | try_newlib_xscale_SOURCES = |
45 | try_newlib_xscale_LDADD = libmulti.a libnewlib-xscale.a -lrt |
46 | |
47 | -AM_CPPFLAGS = $(neon_cppflags) |
48 | +AM_CPPFLAGS = $(fpu_flags) |
49 | +AM_LDFLAGS = $(fpu_flags) |
50 | |
51 | endif |
52 | |
53 | |
54 | === modified file 'configure.ac' |
55 | --- configure.ac 2012-12-12 02:30:07 +0000 |
56 | +++ configure.ac 2013-03-26 10:23:26 +0000 |
57 | @@ -77,4 +77,12 @@ |
58 | AC_SUBST(with_neon) |
59 | AM_CONDITIONAL(WITH_NEON, test x$with_neon = xyes) |
60 | |
61 | +AC_ARG_WITH([vfp], |
62 | + AC_HELP_STRING([--with-vfp], |
63 | + [include VFP specific routines @<:@default=yes@:>@]), |
64 | + [with_vfp=$withval], |
65 | + [with_vfp=yes]) |
66 | +AC_SUBST(with_vfp) |
67 | +AM_CONDITIONAL(WITH_VFP, test x$with_vfp = xyes) |
68 | + |
69 | AC_OUTPUT |
70 | |
71 | === removed file 'src/linaro-a9/memcpy-hybrid.S' |
72 | --- src/linaro-a9/memcpy-hybrid.S 2011-09-08 17:20:49 +0000 |
73 | +++ src/linaro-a9/memcpy-hybrid.S 1970-01-01 00:00:00 +0000 |
74 | @@ -1,152 +0,0 @@ |
75 | -/* Copyright (c) 2010-2011, Linaro Limited |
76 | - All rights reserved. |
77 | - |
78 | - Redistribution and use in source and binary forms, with or without |
79 | - modification, are permitted provided that the following conditions |
80 | - are met: |
81 | - |
82 | - * Redistributions of source code must retain the above copyright |
83 | - notice, this list of conditions and the following disclaimer. |
84 | - |
85 | - * Redistributions in binary form must reproduce the above copyright |
86 | - notice, this list of conditions and the following disclaimer in the |
87 | - documentation and/or other materials provided with the distribution. |
88 | - |
89 | - * Neither the name of Linaro Limited nor the names of its |
90 | - contributors may be used to endorse or promote products derived |
91 | - from this software without specific prior written permission. |
92 | - |
93 | - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
94 | - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
95 | - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
96 | - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
97 | - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
98 | - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
99 | - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
100 | - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
101 | - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
102 | - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
103 | - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
104 | - |
105 | - Written by Dave Gilbert <david.gilbert@linaro.org> |
106 | - |
107 | - This memcpy routine is optimised on a Cortex-A9 and should work on |
108 | - all ARMv7 processors with NEON. */ |
109 | - |
110 | -@ 2011-09-01 david.gilbert@linaro.org |
111 | -@ Extracted from local git 2f11b436 |
112 | - |
113 | - .syntax unified |
114 | - .arch armv7-a |
115 | - |
116 | -@ this lets us check a flag in a 00/ff byte easily in either endianness |
117 | -#ifdef __ARMEB__ |
118 | -#define CHARTSTMASK(c) 1<<(31-(c*8)) |
119 | -#else |
120 | -#define CHARTSTMASK(c) 1<<(c*8) |
121 | -#endif |
122 | - .text |
123 | - .thumb |
124 | - |
125 | -@ --------------------------------------------------------------------------- |
126 | - .thumb_func |
127 | - .align 2 |
128 | - .p2align 4,,15 |
129 | - .global memcpy |
130 | - .type memcpy,%function |
131 | -memcpy: |
132 | - @ r0 = dest |
133 | - @ r1 = source |
134 | - @ r2 = count |
135 | - @ returns dest in r0 |
136 | - @ Overlaps of source/dest not allowed according to spec |
137 | - @ Note this routine relies on v7 misaligned loads/stores |
138 | - pld [r1] |
139 | - mov r12, r0 @ stash original r0 |
140 | - cmp r2,#32 |
141 | - blt 10f @ take the small copy case separately |
142 | - |
143 | - @ test for either source or destination being misaligned |
144 | - @ (We only rely on word align) |
145 | - tst r0,#3 |
146 | - it eq |
147 | - tsteq r1,#3 |
148 | - bne 30f @ misaligned case |
149 | - |
150 | -4: |
151 | - @ at this point we are word (or better) aligned and have at least |
152 | - @ 32 bytes to play with |
153 | - |
154 | - @ If it's a huge copy, try Neon |
155 | - cmp r2, #128*1024 |
156 | - bge 35f @ Sharing general non-aligned case here, aligned could be faster |
157 | - |
158 | - push {r3,r4,r5,r6,r7,r8,r10,r11} |
159 | -5: |
160 | - ldmia r1!,{r3,r4,r5,r6,r7,r8,r10,r11} |
161 | - sub r2,r2,#32 |
162 | - pld [r1,#96] |
163 | - cmp r2,#32 |
164 | - stmia r0!,{r3,r4,r5,r6,r7,r8,r10,r11} |
165 | - bge 5b |
166 | - |
167 | - pop {r3,r4,r5,r6,r7,r8,r10,r11} |
168 | - @ We are now down to less than 32 bytes |
169 | - cbz r2,15f @ quick exit for the case where we copied a multiple of 32 |
170 | - |
171 | -10: @ small copies (not necessarily aligned - note might be slightly more than 32bytes) |
172 | - cmp r2,#4 |
173 | - blt 12f |
174 | -11: |
175 | - sub r2,r2,#4 |
176 | - cmp r2,#4 |
177 | - ldr r3, [r1],#4 |
178 | - str r3, [r0],#4 |
179 | - bge 11b |
180 | -12: |
181 | - tst r2,#2 |
182 | - itt ne |
183 | - ldrhne r3, [r1],#2 |
184 | - strhne r3, [r0],#2 |
185 | - |
186 | - tst r2,#1 |
187 | - itt ne |
188 | - ldrbne r3, [r1],#1 |
189 | - strbne r3, [r0],#1 |
190 | - |
191 | -15: @ exit |
192 | - mov r0,r12 @ restore r0 |
193 | - bx lr |
194 | - |
195 | - .align 2 |
196 | - .p2align 4,,15 |
197 | -30: @ non-aligned - at least 32 bytes to play with |
198 | - @ Test for co-misalignment |
199 | - eor r3, r0, r1 |
200 | - tst r3,#3 |
201 | - beq 50f |
202 | - |
203 | - @ Use Neon for misaligned |
204 | -35: |
205 | - vld1.8 {d0,d1,d2,d3}, [r1]! |
206 | - sub r2,r2,#32 |
207 | - cmp r2,#32 |
208 | - pld [r1,#96] |
209 | - vst1.8 {d0,d1,d2,d3}, [r0]! |
210 | - bge 35b |
211 | - b 10b @ TODO: Probably a bad idea to switch to ARM at this point |
212 | - |
213 | - .align 2 |
214 | - .p2align 4,,15 |
215 | -50: @ Co-misaligned |
216 | - @ At this point we've got at least 32 bytes |
217 | -51: |
218 | - ldrb r3,[r1],#1 |
219 | - sub r2,r2,#1 |
220 | - strb r3,[r0],#1 |
221 | - tst r0,#7 |
222 | - bne 51b |
223 | - |
224 | - cmp r2,#32 |
225 | - blt 10b |
226 | - b 4b |
227 | |
228 | === modified file 'src/linaro-a9/memcpy.S' |
229 | --- src/linaro-a9/memcpy.S 2011-09-09 00:20:15 +0000 |
230 | +++ src/linaro-a9/memcpy.S 2013-03-26 10:23:26 +0000 |
231 | @@ -1,4 +1,4 @@ |
232 | -/* Copyright (c) 2010-2011, Linaro Limited |
233 | +/* Copyright (c) 2013, Linaro Limited |
234 | All rights reserved. |
235 | |
236 | Redistribution and use in source and binary forms, with or without |
237 | @@ -28,241 +28,590 @@ |
238 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
239 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
240 | |
241 | - Written by Dave Gilbert <david.gilbert@linaro.org> |
242 | - |
243 | - This memcpy routine is optimised on a Cortex-A9 and should work on |
244 | - all ARMv7 processors. */ |
245 | - |
246 | -@ 2011-09-01 david.gilbert@linaro.org |
247 | -@ Extracted from local git 2f11b436 |
248 | + This memcpy routine is optimised for Cortex-A cores and takes advantage |
249 | + of VFP or NEON when built with the appropriate flags. |
250 | + |
251 | + Assumptions: |
252 | + |
253 | + ARMv6 (ARMv7-a if using Neon) |
254 | + ARM state |
255 | + Unaligned accesses |
256 | + LDRD/STRD support unaligned word accesses |
257 | + Not tested on big-endian |
258 | + |
259 | + */ |
260 | |
261 | .syntax unified |
262 | - .arch armv7-a |
263 | - |
264 | -@ this lets us check a flag in a 00/ff byte easily in either endianness |
265 | -#ifdef __ARMEB__ |
266 | -#define CHARTSTMASK(c) 1<<(31-(c*8)) |
267 | -#else |
268 | -#define CHARTSTMASK(c) 1<<(c*8) |
269 | -#endif |
270 | + /* This implementation requires ARM state. */ |
271 | + .arm |
272 | + |
273 | +#ifdef __ARM_NEON__ |
274 | + |
275 | + .fpu neon |
276 | + .arch armv7-a |
277 | +# define FRAME_SIZE 4 |
278 | +# define USE_VFP |
279 | +# define USE_NEON |
280 | + |
281 | +#elif !defined (__SOFTFP__) |
282 | + |
283 | + .arch armv6 |
284 | + .fpu vfpv2 |
285 | +# define FRAME_SIZE 32 |
286 | +# define USE_VFP |
287 | + |
288 | +#else |
289 | + .arch armv6 |
290 | +# define FRAME_SIZE 32 |
291 | + |
292 | +#endif |
293 | + |
294 | +/* Old versions of GAS incorrectly implement the NEON align semantics. */ |
295 | +#ifdef BROKEN_ASM_NEON_ALIGN |
296 | +#define ALIGN(addr, align) addr,:align |
297 | +#else |
298 | +#define ALIGN(addr, align) addr:align |
299 | +#endif |
300 | + |
301 | +#define PC_OFFSET 8 /* PC pipeline compensation. */ |
302 | +#define INSN_SIZE 4 |
303 | + |
304 | +/* Call parameters. */ |
305 | +#define dstin r0 |
306 | +#define src r1 |
307 | +#define count r2 |
308 | + |
309 | +/* Locals. */ |
310 | +#define tmp1 r3 |
311 | +#define dst ip |
312 | +#define tmp2 r10 |
313 | + |
314 | +#ifndef USE_NEON |
315 | +/* For bulk copies using GP registers. */ |
316 | +#define A_l r2 /* Call-clobbered. */ |
317 | +#define A_h r3 /* Call-clobbered. */ |
318 | +#define B_l r4 |
319 | +#define B_h r5 |
320 | +#define C_l r6 |
321 | +#define C_h r7 |
322 | +#define D_l r8 |
323 | +#define D_h r9 |
324 | +#endif |
325 | + |
326 | +/* Number of lines ahead to pre-fetch data. If you change this the code |
327 | + below will need adjustment to compensate. */ |
328 | + |
329 | +#define prefetch_lines 5 |
330 | + |
331 | +#ifdef USE_VFP |
332 | + .macro cpy_line_vfp vreg, base |
333 | + vstr \vreg, [dst, #\base] |
334 | + vldr \vreg, [src, #\base] |
335 | + vstr d0, [dst, #\base + 8] |
336 | + vldr d0, [src, #\base + 8] |
337 | + vstr d1, [dst, #\base + 16] |
338 | + vldr d1, [src, #\base + 16] |
339 | + vstr d2, [dst, #\base + 24] |
340 | + vldr d2, [src, #\base + 24] |
341 | + vstr \vreg, [dst, #\base + 32] |
342 | + vldr \vreg, [src, #\base + prefetch_lines * 64 - 32] |
343 | + vstr d0, [dst, #\base + 40] |
344 | + vldr d0, [src, #\base + 40] |
345 | + vstr d1, [dst, #\base + 48] |
346 | + vldr d1, [src, #\base + 48] |
347 | + vstr d2, [dst, #\base + 56] |
348 | + vldr d2, [src, #\base + 56] |
349 | + .endm |
350 | + |
351 | + .macro cpy_tail_vfp vreg, base |
352 | + vstr \vreg, [dst, #\base] |
353 | + vldr \vreg, [src, #\base] |
354 | + vstr d0, [dst, #\base + 8] |
355 | + vldr d0, [src, #\base + 8] |
356 | + vstr d1, [dst, #\base + 16] |
357 | + vldr d1, [src, #\base + 16] |
358 | + vstr d2, [dst, #\base + 24] |
359 | + vldr d2, [src, #\base + 24] |
360 | + vstr \vreg, [dst, #\base + 32] |
361 | + vstr d0, [dst, #\base + 40] |
362 | + vldr d0, [src, #\base + 40] |
363 | + vstr d1, [dst, #\base + 48] |
364 | + vldr d1, [src, #\base + 48] |
365 | + vstr d2, [dst, #\base + 56] |
366 | + vldr d2, [src, #\base + 56] |
367 | + .endm |
368 | +#endif |
369 | + |
370 | + .macro def_fn f p2align=0 |
371 | .text |
372 | - .thumb |
373 | - |
374 | -@ --------------------------------------------------------------------------- |
375 | - .thumb_func |
376 | - .align 2 |
377 | - .p2align 4,,15 |
378 | - .global memcpy |
379 | - .type memcpy,%function |
380 | -memcpy: |
381 | - @ r0 = dest |
382 | - @ r1 = source |
383 | - @ r2 = count |
384 | - @ returns dest in r0 |
385 | - @ Overlaps of source/dest not allowed according to spec |
386 | - @ Note this routine relies on v7 misaligned loads/stores |
387 | - pld [r1] |
388 | - mov r12, r0 @ stash original r0 |
389 | - cmp r2,#32 |
390 | - blt 10f @ take the small copy case separately |
391 | - |
392 | - @ test for either source or destination being misaligned |
393 | - @ (We only rely on word align) |
394 | - @ TODO: Test for co-misalignment |
395 | - tst r0,#3 |
396 | - it eq |
397 | - tsteq r1,#3 |
398 | - bne 30f @ misaligned case |
399 | - |
400 | -4: |
401 | - @ at this point we are word (or better) aligned and have at least |
402 | - @ 32 bytes to play with |
403 | - push {r3,r4,r5,r6,r7,r8,r10,r11} |
404 | -5: |
405 | - ldmia r1!,{r3,r4,r5,r6,r7,r8,r10,r11} |
406 | - pld [r1,#96] |
407 | - sub r2,r2,#32 |
408 | - cmp r2,#32 |
409 | - stmia r0!,{r3,r4,r5,r6,r7,r8,r10,r11} |
410 | - bge 5b |
411 | - |
412 | - pop {r3,r4,r5,r6,r7,r8,r10,r11} |
413 | - @ We are now down to less than 32 bytes |
414 | - cbz r2,15f @ quick exit for the case where we copied a multiple of 32 |
415 | - |
416 | -10: @ small copies (not necessarily aligned - note might be slightly more than 32bytes) |
417 | - cmp r2,#4 |
418 | - blt 12f |
419 | -11: |
420 | - sub r2,r2,#4 |
421 | - cmp r2,#4 |
422 | - ldr r3, [r1],#4 |
423 | - str r3, [r0],#4 |
424 | - bge 11b |
425 | -12: |
426 | - tst r2,#2 |
427 | - itt ne |
428 | - ldrhne r3, [r1],#2 |
429 | - strhne r3, [r0],#2 |
430 | - |
431 | - tst r2,#1 |
432 | - itt ne |
433 | - ldrbne r3, [r1],#1 |
434 | - strbne r3, [r0],#1 |
435 | - |
436 | -15: @ exit |
437 | - mov r0,r12 @ restore r0 |
438 | - bx lr |
439 | - |
440 | -30: @ non-aligned - at least 32 bytes to play with |
441 | - @ On v7 we're allowed to do ldr's and str's from arbitrary alignments |
442 | - @ but not ldrd/strd or ldm/stm |
443 | - @ Note Neon is often a better choice misaligned using vld1 |
444 | - |
445 | - @ copy a byte at a time until the point where we have an aligned destination |
446 | - @ we know we have enough bytes to go to know we won't run out in this phase |
447 | - tst r0,#7 |
448 | - beq 35f |
449 | - |
450 | -31: |
451 | - ldrb r3,[r1],#1 |
452 | - sub r2,r2,#1 |
453 | - strb r3,[r0],#1 |
454 | - tst r0,#7 |
455 | - bne 31b |
456 | - |
457 | - cmp r2,#32 @ Lets get back to knowing we have 32 bytes to play with |
458 | - blt 11b |
459 | - |
460 | - @ Now the store address is aligned |
461 | -35: |
462 | - push {r3,r4,r5,r6,r7,r8,r10,r11,r12,r14} |
463 | - and r6,r1,#3 @ how misaligned we are |
464 | - cmp r6,#2 |
465 | - cbz r6, 100f @ Go there if we're actually aligned |
466 | - bge 120f @ And here if it's aligned on 2 or 3 byte |
467 | - @ Note might be worth splitting to bgt and a separate beq |
468 | - @ if the branches are well separated |
469 | - |
470 | - @ At this point dest is aligned, source is 1 byte forward |
471 | -110: |
472 | - ldr r3,[r1] @ Misaligned load - but it gives the first 4 bytes to store |
473 | - sub r2,r2,#3 @ Number of bytes left in whole words we can load |
474 | - add r1,r1,#3 @ To aligned load address |
475 | - bic r3,r3,#0xff000000 |
476 | - |
477 | -112: |
478 | - ldmia r1!,{r5,r6,r7,r8} |
479 | - sub r2,r2,#32 |
480 | - cmp r2,#32 |
481 | - pld [r1,#96] |
482 | - |
483 | - orr r3,r3,r5,lsl#24 |
484 | - mov r4,r5,lsr#8 |
485 | - mov r5,r6,lsr#8 |
486 | - orr r4,r4,r6,lsl#24 |
487 | - mov r6,r7,lsr#8 |
488 | - ldmia r1!,{r10,r11,r12,r14} |
489 | - orr r5,r5,r7,lsl#24 |
490 | - mov r7,r8,lsr#8 |
491 | - orr r6,r6,r8,lsl#24 |
492 | - mov r8,r10,lsr#8 |
493 | - orr r7,r7,r10,lsl#24 |
494 | - mov r10,r11,lsr#8 |
495 | - orr r8,r8,r11,lsl#24 |
496 | - orr r10,r10,r12,lsl#24 |
497 | - mov r11,r12,lsr#8 |
498 | - orr r11,r11,r14,lsl#24 |
499 | - stmia r0!,{r3,r4,r5,r6,r7,r8,r10,r11} |
500 | - mov r3,r14,lsr#8 |
501 | - |
502 | - bge 112b |
503 | - |
504 | - @ Deal with the stragglers |
505 | - add r2,r2,#3 |
506 | - sub r1,r1,#3 |
507 | - pop {r3,r4,r5,r6,r7,r8,r10,r11,r12,r14} |
508 | - b 10b |
509 | - |
510 | -100: @ Dest and source aligned - must have been originally co-misaligned |
511 | - @ Fallback to main aligned case if still big enough |
512 | - pop {r3,r4,r5,r6,r7,r8,r10,r11,r12,r14} |
513 | - b 4b @ Big copies (32 bytes or more) |
514 | - |
515 | -120: @ Dest is aligned, source is align+2 or 3 |
516 | - bgt 130f @ Now split off for 3 byte offset |
517 | - |
518 | - ldrh r3,[r1] |
519 | - sub r2,r2,#2 @ Number of bytes left in whole words we can load |
520 | - add r1,r1,#2 @ To aligned load address |
521 | - |
522 | -122: |
523 | - ldmia r1!,{r5,r6,r7,r8} |
524 | - sub r2,r2,#32 |
525 | - cmp r2,#32 |
526 | - pld [r1,#96] |
527 | - |
528 | - orr r3,r3,r5,lsl#16 |
529 | - mov r4,r5,lsr#16 |
530 | - mov r5,r6,lsr#16 |
531 | - orr r4,r4,r6,lsl#16 |
532 | - mov r6,r7,lsr#16 |
533 | - ldmia r1!,{r10,r11,r12,r14} |
534 | - orr r5,r5,r7,lsl#16 |
535 | - orr r6,r6,r8,lsl#16 |
536 | - mov r7,r8,lsr#16 |
537 | - orr r7,r7,r10,lsl#16 |
538 | - mov r8,r10,lsr#16 |
539 | - orr r8,r8,r11,lsl#16 |
540 | - mov r10,r11,lsr#16 |
541 | - orr r10,r10,r12,lsl#16 |
542 | - mov r11,r12,lsr#16 |
543 | - orr r11,r11,r14,lsl#16 |
544 | - stmia r0!,{r3,r4,r5,r6,r7,r8,r10,r11} |
545 | - mov r3,r14,lsr#16 |
546 | - |
547 | - bge 122b |
548 | - |
549 | - @ Deal with the stragglers |
550 | - add r2,r2,#2 |
551 | - sub r1,r1,#2 |
552 | - pop {r3,r4,r5,r6,r7,r8,r10,r11,r12,r14} |
553 | - b 10b |
554 | - |
555 | -130: @ Dest is aligned, source is align+3 |
556 | - ldrb r3,[r1] |
557 | - sub r2,r2,#1 @ Number of bytes left in whole words we can load |
558 | - add r1,r1,#1 @ To aligned load address |
559 | - |
560 | -132: |
561 | - ldmia r1!,{r5,r6,r7,r8} |
562 | - sub r2,r2,#32 |
563 | - cmp r2,#32 |
564 | - pld [r1,#96] |
565 | - |
566 | - orr r3,r3,r5,lsl#8 |
567 | - mov r4,r5,lsr#24 |
568 | - mov r5,r6,lsr#24 |
569 | - orr r4,r4,r6,lsl#8 |
570 | - mov r6,r7,lsr#24 |
571 | - ldmia r1!,{r10,r11,r12,r14} |
572 | - orr r5,r5,r7,lsl#8 |
573 | - mov r7,r8,lsr#24 |
574 | - orr r6,r6,r8,lsl#8 |
575 | - mov r8,r10,lsr#24 |
576 | - orr r7,r7,r10,lsl#8 |
577 | - orr r8,r8,r11,lsl#8 |
578 | - mov r10,r11,lsr#24 |
579 | - orr r10,r10,r12,lsl#8 |
580 | - mov r11,r12,lsr#24 |
581 | - orr r11,r11,r14,lsl#8 |
582 | - stmia r0!,{r3,r4,r5,r6,r7,r8,r10,r11} |
583 | - mov r3,r14,lsr#24 |
584 | - |
585 | - bge 132b |
586 | - |
587 | - @ Deal with the stragglers |
588 | - add r2,r2,#1 |
589 | - sub r1,r1,#1 |
590 | - pop {r3,r4,r5,r6,r7,r8,r10,r11,r12,r14} |
591 | - b 10b |
592 | + .p2align \p2align |
593 | + .global \f |
594 | + .type \f, %function |
595 | +\f: |
596 | + .endm |
597 | + |
598 | +def_fn memcpy p2align=6 |
599 | + |
600 | + mov dst, dstin /* Preserve dstin, we need to return it. */ |
601 | + cmp count, #64 |
602 | + bge .Lcpy_not_short |
603 | + /* Deal with small copies quickly by dropping straight into the |
604 | + exit block. */ |
605 | + |
606 | +.Ltail63unaligned: |
607 | +#ifdef USE_NEON |
608 | + and tmp1, count, #0x38 |
609 | + rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE) |
610 | + add pc, pc, tmp1 |
611 | + vld1.8 {d0}, [src]! /* 14 words to go. */ |
612 | + vst1.8 {d0}, [dst]! |
613 | + vld1.8 {d0}, [src]! /* 12 words to go. */ |
614 | + vst1.8 {d0}, [dst]! |
615 | + vld1.8 {d0}, [src]! /* 10 words to go. */ |
616 | + vst1.8 {d0}, [dst]! |
617 | + vld1.8 {d0}, [src]! /* 8 words to go. */ |
618 | + vst1.8 {d0}, [dst]! |
619 | + vld1.8 {d0}, [src]! /* 6 words to go. */ |
620 | + vst1.8 {d0}, [dst]! |
621 | + vld1.8 {d0}, [src]! /* 4 words to go. */ |
622 | + vst1.8 {d0}, [dst]! |
623 | + vld1.8 {d0}, [src]! /* 2 words to go. */ |
624 | + vst1.8 {d0}, [dst]! |
625 | + |
626 | + tst count, #4 |
627 | + ldrne tmp1, [src], #4 |
628 | + strne tmp1, [dst], #4 |
629 | +#else |
630 | + /* Copy up to 15 full words of data. May not be aligned. */ |
631 | + /* Cannot use VFP for unaligned data. */ |
632 | + and tmp1, count, #0x3c |
633 | + add dst, dst, tmp1 |
634 | + add src, src, tmp1 |
635 | + rsb tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2) |
636 | + /* Jump directly into the sequence below at the correct offset. */ |
637 | + add pc, pc, tmp1, lsl #1 |
638 | + |
639 | + ldr tmp1, [src, #-60] /* 15 words to go. */ |
640 | + str tmp1, [dst, #-60] |
641 | + |
642 | + ldr tmp1, [src, #-56] /* 14 words to go. */ |
643 | + str tmp1, [dst, #-56] |
644 | + ldr tmp1, [src, #-52] |
645 | + str tmp1, [dst, #-52] |
646 | + |
647 | + ldr tmp1, [src, #-48] /* 12 words to go. */ |
648 | + str tmp1, [dst, #-48] |
649 | + ldr tmp1, [src, #-44] |
650 | + str tmp1, [dst, #-44] |
651 | + |
652 | + ldr tmp1, [src, #-40] /* 10 words to go. */ |
653 | + str tmp1, [dst, #-40] |
654 | + ldr tmp1, [src, #-36] |
655 | + str tmp1, [dst, #-36] |
656 | + |
657 | + ldr tmp1, [src, #-32] /* 8 words to go. */ |
658 | + str tmp1, [dst, #-32] |
659 | + ldr tmp1, [src, #-28] |
660 | + str tmp1, [dst, #-28] |
661 | + |
662 | + ldr tmp1, [src, #-24] /* 6 words to go. */ |
663 | + str tmp1, [dst, #-24] |
664 | + ldr tmp1, [src, #-20] |
665 | + str tmp1, [dst, #-20] |
666 | + |
667 | + ldr tmp1, [src, #-16] /* 4 words to go. */ |
668 | + str tmp1, [dst, #-16] |
669 | + ldr tmp1, [src, #-12] |
670 | + str tmp1, [dst, #-12] |
671 | + |
672 | + ldr tmp1, [src, #-8] /* 2 words to go. */ |
673 | + str tmp1, [dst, #-8] |
674 | + ldr tmp1, [src, #-4] |
675 | + str tmp1, [dst, #-4] |
676 | +#endif |
677 | + |
678 | + lsls count, count, #31 |
679 | + ldrhcs tmp1, [src], #2 |
680 | + ldrbne src, [src] /* Src is dead, use as a scratch. */ |
681 | + strhcs tmp1, [dst], #2 |
682 | + strbne src, [dst] |
683 | + bx lr |
684 | + |
685 | +.Lcpy_not_short: |
686 | + /* At least 64 bytes to copy, but don't know the alignment yet. */ |
687 | + str tmp2, [sp, #-FRAME_SIZE]! |
688 | + and tmp2, src, #3 |
689 | + and tmp1, dst, #3 |
690 | + cmp tmp1, tmp2 |
691 | + bne .Lcpy_notaligned |
692 | + |
693 | +#ifdef USE_VFP |
694 | + /* Magic dust alert! Force VFP on Cortex-A9. Experiments show |
695 | + that the FP pipeline is much better at streaming loads and |
696 | + stores. This is outside the critical loop. */ |
697 | + vmov.f32 s0, s0 |
698 | +#endif |
699 | + |
700 | + /* SRC and DST have the same mutual 32-bit alignment, but we may |
701 | + still need to pre-copy some bytes to get to natural alignment. |
702 | + We bring DST into full 64-bit alignment. */ |
703 | + lsls tmp2, dst, #29 |
704 | + beq 1f |
705 | + rsbs tmp2, tmp2, #0 |
706 | + sub count, count, tmp2, lsr #29 |
707 | + ldrmi tmp1, [src], #4 |
708 | + strmi tmp1, [dst], #4 |
709 | + lsls tmp2, tmp2, #2 |
710 | + ldrhcs tmp1, [src], #2 |
711 | + ldrbne tmp2, [src], #1 |
712 | + strhcs tmp1, [dst], #2 |
713 | + strbne tmp2, [dst], #1 |
714 | + |
715 | +1: |
716 | + subs tmp2, count, #64 /* Use tmp2 for count. */ |
717 | + blt .Ltail63aligned |
718 | + |
719 | + cmp tmp2, #512 |
720 | + bge .Lcpy_body_long |
721 | + |
722 | +.Lcpy_body_medium: /* Count in tmp2. */ |
723 | +#ifdef USE_VFP |
724 | +1: |
725 | + vldr d0, [src, #0] |
726 | + subs tmp2, tmp2, #64 |
727 | + vldr d1, [src, #8] |
728 | + vstr d0, [dst, #0] |
729 | + vldr d0, [src, #16] |
730 | + vstr d1, [dst, #8] |
731 | + vldr d1, [src, #24] |
732 | + vstr d0, [dst, #16] |
733 | + vldr d0, [src, #32] |
734 | + vstr d1, [dst, #24] |
735 | + vldr d1, [src, #40] |
736 | + vstr d0, [dst, #32] |
737 | + vldr d0, [src, #48] |
738 | + vstr d1, [dst, #40] |
739 | + vldr d1, [src, #56] |
740 | + vstr d0, [dst, #48] |
741 | + add src, src, #64 |
742 | + vstr d1, [dst, #56] |
743 | + add dst, dst, #64 |
744 | + bge 1b |
745 | + tst tmp2, #0x3f |
746 | + beq .Ldone |
747 | + |
748 | +.Ltail63aligned: /* Count in tmp2. */ |
749 | + and tmp1, tmp2, #0x38 |
750 | + add dst, dst, tmp1 |
751 | + add src, src, tmp1 |
752 | + rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE) |
753 | + add pc, pc, tmp1 |
754 | + |
755 | + vldr d0, [src, #-56] /* 14 words to go. */ |
756 | + vstr d0, [dst, #-56] |
757 | + vldr d0, [src, #-48] /* 12 words to go. */ |
758 | + vstr d0, [dst, #-48] |
759 | + vldr d0, [src, #-40] /* 10 words to go. */ |
760 | + vstr d0, [dst, #-40] |
761 | + vldr d0, [src, #-32] /* 8 words to go. */ |
762 | + vstr d0, [dst, #-32] |
763 | + vldr d0, [src, #-24] /* 6 words to go. */ |
764 | + vstr d0, [dst, #-24] |
765 | + vldr d0, [src, #-16] /* 4 words to go. */ |
766 | + vstr d0, [dst, #-16] |
767 | + vldr d0, [src, #-8] /* 2 words to go. */ |
768 | + vstr d0, [dst, #-8] |
769 | +#else |
770 | + sub src, src, #8 |
771 | + sub dst, dst, #8 |
772 | +1: |
773 | + ldrd A_l, A_h, [src, #8] |
774 | + strd A_l, A_h, [dst, #8] |
775 | + ldrd A_l, A_h, [src, #16] |
776 | + strd A_l, A_h, [dst, #16] |
777 | + ldrd A_l, A_h, [src, #24] |
778 | + strd A_l, A_h, [dst, #24] |
779 | + ldrd A_l, A_h, [src, #32] |
780 | + strd A_l, A_h, [dst, #32] |
781 | + ldrd A_l, A_h, [src, #40] |
782 | + strd A_l, A_h, [dst, #40] |
783 | + ldrd A_l, A_h, [src, #48] |
784 | + strd A_l, A_h, [dst, #48] |
785 | + ldrd A_l, A_h, [src, #56] |
786 | + strd A_l, A_h, [dst, #56] |
787 | + ldrd A_l, A_h, [src, #64]! |
788 | + strd A_l, A_h, [dst, #64]! |
789 | + subs tmp2, tmp2, #64 |
790 | + bge 1b |
791 | + tst tmp2, #0x3f |
792 | + bne 1f |
793 | + ldr tmp2,[sp], #FRAME_SIZE |
794 | + bx lr |
795 | +1: |
796 | + add src, src, #8 |
797 | + add dst, dst, #8 |
798 | + |
799 | +.Ltail63aligned: /* Count in tmp2. */ |
800 | + /* Copy up to 7 d-words of data. Similar to Ltail63unaligned, but |
801 | + we know that the src and dest are 32-bit aligned so we can use |
802 | + LDRD/STRD to improve efficiency. */ |
803 | + /* TMP2 is now negative, but we don't care about that. The bottom |
804 | + six bits still tell us how many bytes are left to copy. */ |
805 | + |
806 | + and tmp1, tmp2, #0x38 |
807 | + add dst, dst, tmp1 |
808 | + add src, src, tmp1 |
809 | + rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE) |
810 | + add pc, pc, tmp1 |
811 | + ldrd A_l, A_h, [src, #-56] /* 14 words to go. */ |
812 | + strd A_l, A_h, [dst, #-56] |
813 | + ldrd A_l, A_h, [src, #-48] /* 12 words to go. */ |
814 | + strd A_l, A_h, [dst, #-48] |
815 | + ldrd A_l, A_h, [src, #-40] /* 10 words to go. */ |
816 | + strd A_l, A_h, [dst, #-40] |
817 | + ldrd A_l, A_h, [src, #-32] /* 8 words to go. */ |
818 | + strd A_l, A_h, [dst, #-32] |
819 | + ldrd A_l, A_h, [src, #-24] /* 6 words to go. */ |
820 | + strd A_l, A_h, [dst, #-24] |
821 | + ldrd A_l, A_h, [src, #-16] /* 4 words to go. */ |
822 | + strd A_l, A_h, [dst, #-16] |
823 | + ldrd A_l, A_h, [src, #-8] /* 2 words to go. */ |
824 | + strd A_l, A_h, [dst, #-8] |
825 | + |
826 | +#endif |
827 | + tst tmp2, #4 |
828 | + ldrne tmp1, [src], #4 |
829 | + strne tmp1, [dst], #4 |
830 | + lsls tmp2, tmp2, #31 /* Count (tmp2) now dead. */ |
831 | + ldrhcs tmp1, [src], #2 |
832 | + ldrbne tmp2, [src] |
833 | + strhcs tmp1, [dst], #2 |
834 | + strbne tmp2, [dst] |
835 | + |
836 | +.Ldone: |
837 | + ldr tmp2, [sp], #FRAME_SIZE |
838 | + bx lr |
839 | + |
840 | +.Lcpy_body_long: /* Count in tmp2. */ |
841 | + |
842 | + /* Long copy. We know that there's at least (prefetch_lines * 64) |
843 | + bytes to go. */ |
844 | +#ifdef USE_VFP |
845 | + /* Don't use PLD. Instead, read some data in advance of the current |
846 | + copy position into a register. This should act like a PLD |
847 | + operation but we won't have to repeat the transfer. */ |
848 | + |
849 | + vldr d3, [src, #0] |
850 | + vldr d4, [src, #64] |
851 | + vldr d5, [src, #128] |
852 | + vldr d6, [src, #192] |
853 | + vldr d7, [src, #256] |
854 | + |
855 | + vldr d0, [src, #8] |
856 | + vldr d1, [src, #16] |
857 | + vldr d2, [src, #24] |
858 | + add src, src, #32 |
859 | + |
860 | + subs tmp2, tmp2, #prefetch_lines * 64 * 2 |
861 | + blt 2f |
862 | +1: |
863 | + cpy_line_vfp d3, 0 |
864 | + cpy_line_vfp d4, 64 |
865 | + cpy_line_vfp d5, 128 |
866 | + add dst, dst, #3 * 64 |
867 | + add src, src, #3 * 64 |
868 | + cpy_line_vfp d6, 0 |
869 | + cpy_line_vfp d7, 64 |
870 | + add dst, dst, #2 * 64 |
871 | + add src, src, #2 * 64 |
872 | + subs tmp2, tmp2, #prefetch_lines * 64 |
873 | + bge 1b |
874 | + |
875 | +2: |
876 | + cpy_tail_vfp d3, 0 |
877 | + cpy_tail_vfp d4, 64 |
878 | + cpy_tail_vfp d5, 128 |
879 | + add src, src, #3 * 64 |
880 | + add dst, dst, #3 * 64 |
881 | + cpy_tail_vfp d6, 0 |
882 | + vstr d7, [dst, #64] |
883 | + vldr d7, [src, #64] |
884 | + vstr d0, [dst, #64 + 8] |
885 | + vldr d0, [src, #64 + 8] |
886 | + vstr d1, [dst, #64 + 16] |
887 | + vldr d1, [src, #64 + 16] |
888 | + vstr d2, [dst, #64 + 24] |
889 | + vldr d2, [src, #64 + 24] |
890 | + vstr d7, [dst, #64 + 32] |
891 | + add src, src, #96 |
892 | + vstr d0, [dst, #64 + 40] |
893 | + vstr d1, [dst, #64 + 48] |
894 | + vstr d2, [dst, #64 + 56] |
895 | + add dst, dst, #128 |
896 | + add tmp2, tmp2, #prefetch_lines * 64 |
897 | + b .Lcpy_body_medium |
898 | +#else |
899 | + /* Long copy. Use an SMS style loop to maximize the I/O |
900 | + bandwidth of the core. We don't have enough spare registers |
901 | + to synthesise prefetching, so use PLD operations. */ |
902 | + /* Pre-bias src and dst. */ |
903 | + sub src, src, #8 |
904 | + sub dst, dst, #8 |
905 | + pld [src, #8] |
906 | + pld [src, #72] |
907 | + subs tmp2, tmp2, #64 |
908 | + pld [src, #136] |
909 | + ldrd A_l, A_h, [src, #8] |
910 | + strd B_l, B_h, [sp, #8] |
911 | + ldrd B_l, B_h, [src, #16] |
912 | + strd C_l, C_h, [sp, #16] |
913 | + ldrd C_l, C_h, [src, #24] |
914 | + strd D_l, D_h, [sp, #24] |
915 | + pld [src, #200] |
916 | + ldrd D_l, D_h, [src, #32]! |
917 | + b 1f |
918 | + .p2align 6 |
919 | +2: |
920 | + pld [src, #232] |
921 | + strd A_l, A_h, [dst, #40] |
922 | + ldrd A_l, A_h, [src, #40] |
923 | + strd B_l, B_h, [dst, #48] |
924 | + ldrd B_l, B_h, [src, #48] |
925 | + strd C_l, C_h, [dst, #56] |
926 | + ldrd C_l, C_h, [src, #56] |
927 | + strd D_l, D_h, [dst, #64]! |
928 | + ldrd D_l, D_h, [src, #64]! |
929 | + subs tmp2, tmp2, #64 |
930 | +1: |
931 | + strd A_l, A_h, [dst, #8] |
932 | + ldrd A_l, A_h, [src, #8] |
933 | + strd B_l, B_h, [dst, #16] |
934 | + ldrd B_l, B_h, [src, #16] |
935 | + strd C_l, C_h, [dst, #24] |
936 | + ldrd C_l, C_h, [src, #24] |
937 | + strd D_l, D_h, [dst, #32] |
938 | + ldrd D_l, D_h, [src, #32] |
939 | + bcs 2b |
940 | + /* Save the remaining bytes and restore the callee-saved regs. */ |
941 | + strd A_l, A_h, [dst, #40] |
942 | + add src, src, #40 |
943 | + strd B_l, B_h, [dst, #48] |
944 | + ldrd B_l, B_h, [sp, #8] |
945 | + strd C_l, C_h, [dst, #56] |
946 | + ldrd C_l, C_h, [sp, #16] |
947 | + strd D_l, D_h, [dst, #64] |
948 | + ldrd D_l, D_h, [sp, #24] |
949 | + add dst, dst, #72 |
950 | + tst tmp2, #0x3f |
951 | + bne .Ltail63aligned |
952 | + ldr tmp2, [sp], #FRAME_SIZE |
953 | + bx lr |
954 | +#endif |
955 | + |
956 | +.Lcpy_notaligned: |
957 | + pld [src] |
958 | + pld [src, #64] |
959 | + /* There's at least 64 bytes to copy, but there is no mutual |
960 | + alignment. */ |
961 | + /* Bring DST to 64-bit alignment. */ |
962 | + lsls tmp2, dst, #29 |
963 | + pld [src, #(2 * 64)] |
964 | + beq 1f |
965 | + rsbs tmp2, tmp2, #0 |
966 | + sub count, count, tmp2, lsr #29 |
967 | + ldrmi tmp1, [src], #4 |
968 | + strmi tmp1, [dst], #4 |
969 | + lsls tmp2, tmp2, #2 |
970 | + ldrbne tmp1, [src], #1 |
971 | + ldrhcs tmp2, [src], #2 |
972 | + strbne tmp1, [dst], #1 |
973 | + strhcs tmp2, [dst], #2 |
974 | +1: |
975 | + pld [src, #(3 * 64)] |
976 | + subs count, count, #64 |
977 | + ldrmi tmp2, [sp], #FRAME_SIZE |
978 | + bmi .Ltail63unaligned |
979 | + pld [src, #(4 * 64)] |
980 | + |
981 | +#ifdef USE_NEON |
982 | + vld1.8 {d0-d3}, [src]! |
983 | + vld1.8 {d4-d7}, [src]! |
984 | + subs count, count, #64 |
985 | + bmi 2f |
986 | +1: |
987 | + pld [src, #(4 * 64)] |
988 | + vst1.8 {d0-d3}, [ALIGN (dst, 64)]! |
989 | + vld1.8 {d0-d3}, [src]! |
990 | + vst1.8 {d4-d7}, [ALIGN (dst, 64)]! |
991 | + vld1.8 {d4-d7}, [src]! |
992 | + subs count, count, #64 |
993 | + bpl 1b |
994 | +2: |
995 | + vst1.8 {d0-d3}, [ALIGN (dst, 64)]! |
996 | + vst1.8 {d4-d7}, [ALIGN (dst, 64)]! |
997 | + ands count, count, #0x3f |
998 | +#else |
999 | + /* Use an SMS style loop to maximize the I/O bandwidth. */ |
1000 | + sub src, src, #4 |
1001 | + sub dst, dst, #8 |
1002 | + subs tmp2, count, #64 /* Use tmp2 for count. */ |
1003 | + ldr A_l, [src, #4] |
1004 | + ldr A_h, [src, #8] |
1005 | + strd B_l, B_h, [sp, #8] |
1006 | + ldr B_l, [src, #12] |
1007 | + ldr B_h, [src, #16] |
1008 | + strd C_l, C_h, [sp, #16] |
1009 | + ldr C_l, [src, #20] |
1010 | + ldr C_h, [src, #24] |
1011 | + strd D_l, D_h, [sp, #24] |
1012 | + ldr D_l, [src, #28] |
1013 | + ldr D_h, [src, #32]! |
1014 | + b 1f |
1015 | + .p2align 6 |
1016 | +2: |
1017 | + pld [src, #(5 * 64) - (32 - 4)] |
1018 | + strd A_l, A_h, [dst, #40] |
1019 | + ldr A_l, [src, #36] |
1020 | + ldr A_h, [src, #40] |
1021 | + strd B_l, B_h, [dst, #48] |
1022 | + ldr B_l, [src, #44] |
1023 | + ldr B_h, [src, #48] |
1024 | + strd C_l, C_h, [dst, #56] |
1025 | + ldr C_l, [src, #52] |
1026 | + ldr C_h, [src, #56] |
1027 | + strd D_l, D_h, [dst, #64]! |
1028 | + ldr D_l, [src, #60] |
1029 | + ldr D_h, [src, #64]! |
1030 | + subs tmp2, tmp2, #64 |
1031 | +1: |
1032 | + strd A_l, A_h, [dst, #8] |
1033 | + ldr A_l, [src, #4] |
1034 | + ldr A_h, [src, #8] |
1035 | + strd B_l, B_h, [dst, #16] |
1036 | + ldr B_l, [src, #12] |
1037 | + ldr B_h, [src, #16] |
1038 | + strd C_l, C_h, [dst, #24] |
1039 | + ldr C_l, [src, #20] |
1040 | + ldr C_h, [src, #24] |
1041 | + strd D_l, D_h, [dst, #32] |
1042 | + ldr D_l, [src, #28] |
1043 | + ldr D_h, [src, #32] |
1044 | + bcs 2b |
1045 | + |
1046 | + /* Save the remaining bytes and restore the callee-saved regs. */ |
1047 | + strd A_l, A_h, [dst, #40] |
1048 | + add src, src, #36 |
1049 | + strd B_l, B_h, [dst, #48] |
1050 | + ldrd B_l, B_h, [sp, #8] |
1051 | + strd C_l, C_h, [dst, #56] |
1052 | + ldrd C_l, C_h, [sp, #16] |
1053 | + strd D_l, D_h, [dst, #64] |
1054 | + ldrd D_l, D_h, [sp, #24] |
1055 | + add dst, dst, #72 |
1056 | + ands count, tmp2, #0x3f |
1057 | +#endif |
1058 | + ldr tmp2, [sp], #FRAME_SIZE |
1059 | + bne .Ltail63unaligned |
1060 | + bx lr |
1061 | + |
1062 | + .size memcpy, . - memcpy |