Merge lp:~will-newton/cortex-strings/cortex-strings into lp:cortex-strings

Proposed by Will Newton
Status: Merged
Approved by: Will Newton
Approved revision: 100
Merged at revision: 100
Proposed branch: lp:~will-newton/cortex-strings/cortex-strings
Merge into: lp:cortex-strings
Diff against target: 1062 lines (+604/-399)
4 files modified
Makefile.am (+11/-11)
configure.ac (+8/-0)
src/linaro-a9/memcpy-hybrid.S (+0/-152)
src/linaro-a9/memcpy.S (+585/-236)
To merge this branch: bzr merge lp:~will-newton/cortex-strings/cortex-strings
Reviewer: Will Newton (community)
Status: Approve
Review via email: mp+155445@code.launchpad.net

Description of the change

Integrate the new memcpy implementation, which provides NEON, VFP and plain ARM (core-register) code paths selected at build time. The old NEON-only src/linaro-a9/memcpy-hybrid.S is removed, src/linaro-a9/memcpy.S is replaced by the new routine, configure gains a --with-vfp option, and the Makefile now passes the matching -mfpu flags.
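
As a rough usage sketch (not part of this merge; the cross-compiler triplet is only illustrative), the configure switches touched here could be combined as follows to pick which code path gets built:

  # NEON build: WITH_NEON selects -mfpu=neon, so the compiler typically
  # predefines __ARM_NEON__ and memcpy.S takes the NEON path.
  ./configure --host=arm-linux-gnueabihf --with-neon

  # VFP-only build: disable NEON; the new --with-vfp (default=yes) keeps
  # fpu_flags at -mfpu=vfp, so the VFP path is used.
  ./configure --host=arm-linux-gnueabihf --without-neon --with-vfp

  # Soft-float build: neither NEON nor VFP, so fpu_flags falls back to
  # -msoft-float and the core-register (LDRD/STRD) path is built.
  ./configure --host=arm-linux-gnueabihf --without-neon --without-vfp

Which path is assembled inside src/linaro-a9/memcpy.S then follows from the compiler's __ARM_NEON__ / __SOFTFP__ predefines rather than from the configure switches directly.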

Revision history for this message
Will Newton (will-newton):
review: Approve

Preview Diff

1=== modified file 'Makefile.am'
2--- Makefile.am 2013-01-07 15:57:20 +0000
3+++ Makefile.am 2013-03-26 10:23:26 +0000
4@@ -171,15 +171,15 @@
5
6 if WITH_NEON
7 # Pull in the NEON specific files
8-neon_sources = \
9- src/linaro-a9/memcpy-hybrid.S
10 neon_bionic_sources = \
11 reference/bionic/memcpy.S
12-neon_cppflags = -mfpu=neon
13-neon_dirs = neon
14-else
15-alternate_sources = \
16- src/linaro-a9/memcpy.S
17+fpu_flags = -mfpu=neon
18+else
19+if WITH_VFP
20+fpu_flags = -mfpu=vfp
21+else
22+fpu_flags = -msoft-float
23+endif
24 endif
25
26 # Benchmarks and example programs
27@@ -200,13 +200,12 @@
28
29 # Main library
30 libcortex_strings_la_SOURCES = \
31- $(neon_sources) \
32- $(alternate_sources) \
33 src/thumb-2/strcpy.c \
34 src/linaro-a9/memchr.S \
35 src/linaro-a9/strchr.S \
36 src/linaro-a9/strlen.S \
37- src/linaro-a9/memset.S
38+ src/linaro-a9/memset.S \
39+ src/linaro-a9/memcpy.S
40
41 # Libraries containing the difference reference versions
42 libbionic_a_SOURCES = \
43@@ -259,7 +258,8 @@
44 try_newlib_xscale_SOURCES =
45 try_newlib_xscale_LDADD = libmulti.a libnewlib-xscale.a -lrt
46
47-AM_CPPFLAGS = $(neon_cppflags)
48+AM_CPPFLAGS = $(fpu_flags)
49+AM_LDFLAGS = $(fpu_flags)
50
51 endif
52
53
54=== modified file 'configure.ac'
55--- configure.ac 2012-12-12 02:30:07 +0000
56+++ configure.ac 2013-03-26 10:23:26 +0000
57@@ -77,4 +77,12 @@
58 AC_SUBST(with_neon)
59 AM_CONDITIONAL(WITH_NEON, test x$with_neon = xyes)
60
61+AC_ARG_WITH([vfp],
62+ AC_HELP_STRING([--with-vfp],
63+ [include VFP specific routines @<:@default=yes@:>@]),
64+ [with_vfp=$withval],
65+ [with_vfp=yes])
66+AC_SUBST(with_vfp)
67+AM_CONDITIONAL(WITH_VFP, test x$with_vfp = xyes)
68+
69 AC_OUTPUT
70
71=== removed file 'src/linaro-a9/memcpy-hybrid.S'
72--- src/linaro-a9/memcpy-hybrid.S 2011-09-08 17:20:49 +0000
73+++ src/linaro-a9/memcpy-hybrid.S 1970-01-01 00:00:00 +0000
74@@ -1,152 +0,0 @@
75-/* Copyright (c) 2010-2011, Linaro Limited
76- All rights reserved.
77-
78- Redistribution and use in source and binary forms, with or without
79- modification, are permitted provided that the following conditions
80- are met:
81-
82- * Redistributions of source code must retain the above copyright
83- notice, this list of conditions and the following disclaimer.
84-
85- * Redistributions in binary form must reproduce the above copyright
86- notice, this list of conditions and the following disclaimer in the
87- documentation and/or other materials provided with the distribution.
88-
89- * Neither the name of Linaro Limited nor the names of its
90- contributors may be used to endorse or promote products derived
91- from this software without specific prior written permission.
92-
93- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
94- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
95- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
96- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
97- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
98- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
99- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
100- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
101- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
102- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
103- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
104-
105- Written by Dave Gilbert <david.gilbert@linaro.org>
106-
107- This memcpy routine is optimised on a Cortex-A9 and should work on
108- all ARMv7 processors with NEON. */
109-
110-@ 2011-09-01 david.gilbert@linaro.org
111-@ Extracted from local git 2f11b436
112-
113- .syntax unified
114- .arch armv7-a
115-
116-@ this lets us check a flag in a 00/ff byte easily in either endianness
117-#ifdef __ARMEB__
118-#define CHARTSTMASK(c) 1<<(31-(c*8))
119-#else
120-#define CHARTSTMASK(c) 1<<(c*8)
121-#endif
122- .text
123- .thumb
124-
125-@ ---------------------------------------------------------------------------
126- .thumb_func
127- .align 2
128- .p2align 4,,15
129- .global memcpy
130- .type memcpy,%function
131-memcpy:
132- @ r0 = dest
133- @ r1 = source
134- @ r2 = count
135- @ returns dest in r0
136- @ Overlaps of source/dest not allowed according to spec
137- @ Note this routine relies on v7 misaligned loads/stores
138- pld [r1]
139- mov r12, r0 @ stash original r0
140- cmp r2,#32
141- blt 10f @ take the small copy case separately
142-
143- @ test for either source or destination being misaligned
144- @ (We only rely on word align)
145- tst r0,#3
146- it eq
147- tsteq r1,#3
148- bne 30f @ misaligned case
149-
150-4:
151- @ at this point we are word (or better) aligned and have at least
152- @ 32 bytes to play with
153-
154- @ If it's a huge copy, try Neon
155- cmp r2, #128*1024
156- bge 35f @ Sharing general non-aligned case here, aligned could be faster
157-
158- push {r3,r4,r5,r6,r7,r8,r10,r11}
159-5:
160- ldmia r1!,{r3,r4,r5,r6,r7,r8,r10,r11}
161- sub r2,r2,#32
162- pld [r1,#96]
163- cmp r2,#32
164- stmia r0!,{r3,r4,r5,r6,r7,r8,r10,r11}
165- bge 5b
166-
167- pop {r3,r4,r5,r6,r7,r8,r10,r11}
168- @ We are now down to less than 32 bytes
169- cbz r2,15f @ quick exit for the case where we copied a multiple of 32
170-
171-10: @ small copies (not necessarily aligned - note might be slightly more than 32bytes)
172- cmp r2,#4
173- blt 12f
174-11:
175- sub r2,r2,#4
176- cmp r2,#4
177- ldr r3, [r1],#4
178- str r3, [r0],#4
179- bge 11b
180-12:
181- tst r2,#2
182- itt ne
183- ldrhne r3, [r1],#2
184- strhne r3, [r0],#2
185-
186- tst r2,#1
187- itt ne
188- ldrbne r3, [r1],#1
189- strbne r3, [r0],#1
190-
191-15: @ exit
192- mov r0,r12 @ restore r0
193- bx lr
194-
195- .align 2
196- .p2align 4,,15
197-30: @ non-aligned - at least 32 bytes to play with
198- @ Test for co-misalignment
199- eor r3, r0, r1
200- tst r3,#3
201- beq 50f
202-
203- @ Use Neon for misaligned
204-35:
205- vld1.8 {d0,d1,d2,d3}, [r1]!
206- sub r2,r2,#32
207- cmp r2,#32
208- pld [r1,#96]
209- vst1.8 {d0,d1,d2,d3}, [r0]!
210- bge 35b
211- b 10b @ TODO: Probably a bad idea to switch to ARM at this point
212-
213- .align 2
214- .p2align 4,,15
215-50: @ Co-misaligned
216- @ At this point we've got at least 32 bytes
217-51:
218- ldrb r3,[r1],#1
219- sub r2,r2,#1
220- strb r3,[r0],#1
221- tst r0,#7
222- bne 51b
223-
224- cmp r2,#32
225- blt 10b
226- b 4b
227
228=== modified file 'src/linaro-a9/memcpy.S'
229--- src/linaro-a9/memcpy.S 2011-09-09 00:20:15 +0000
230+++ src/linaro-a9/memcpy.S 2013-03-26 10:23:26 +0000
231@@ -1,4 +1,4 @@
232-/* Copyright (c) 2010-2011, Linaro Limited
233+/* Copyright (c) 2013, Linaro Limited
234 All rights reserved.
235
236 Redistribution and use in source and binary forms, with or without
237@@ -28,241 +28,590 @@
238 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
239 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
240
241- Written by Dave Gilbert <david.gilbert@linaro.org>
242-
243- This memcpy routine is optimised on a Cortex-A9 and should work on
244- all ARMv7 processors. */
245-
246-@ 2011-09-01 david.gilbert@linaro.org
247-@ Extracted from local git 2f11b436
248+ This memcpy routine is optimised for Cortex-A cores and takes advantage
249+ of VFP or NEON when built with the appropriate flags.
250+
251+ Assumptions:
252+
253+ ARMv6 (ARMv7-a if using Neon)
254+ ARM state
255+ Unaligned accesses
256+ LDRD/STRD support unaligned word accesses
257+ Not tested on big-endian
258+
259+ */
260
261 .syntax unified
262- .arch armv7-a
263-
264-@ this lets us check a flag in a 00/ff byte easily in either endianness
265-#ifdef __ARMEB__
266-#define CHARTSTMASK(c) 1<<(31-(c*8))
267-#else
268-#define CHARTSTMASK(c) 1<<(c*8)
269-#endif
270+ /* This implementation requires ARM state. */
271+ .arm
272+
273+#ifdef __ARM_NEON__
274+
275+ .fpu neon
276+ .arch armv7-a
277+# define FRAME_SIZE 4
278+# define USE_VFP
279+# define USE_NEON
280+
281+#elif !defined (__SOFTFP__)
282+
283+ .arch armv6
284+ .fpu vfpv2
285+# define FRAME_SIZE 32
286+# define USE_VFP
287+
288+#else
289+ .arch armv6
290+# define FRAME_SIZE 32
291+
292+#endif
293+
294+/* Old versions of GAS incorrectly implement the NEON align semantics. */
295+#ifdef BROKEN_ASM_NEON_ALIGN
296+#define ALIGN(addr, align) addr,:align
297+#else
298+#define ALIGN(addr, align) addr:align
299+#endif
300+
301+#define PC_OFFSET 8 /* PC pipeline compensation. */
302+#define INSN_SIZE 4
303+
304+/* Call parameters. */
305+#define dstin r0
306+#define src r1
307+#define count r2
308+
309+/* Locals. */
310+#define tmp1 r3
311+#define dst ip
312+#define tmp2 r10
313+
314+#ifndef USE_NEON
315+/* For bulk copies using GP registers. */
316+#define A_l r2 /* Call-clobbered. */
317+#define A_h r3 /* Call-clobbered. */
318+#define B_l r4
319+#define B_h r5
320+#define C_l r6
321+#define C_h r7
322+#define D_l r8
323+#define D_h r9
324+#endif
325+
326+/* Number of lines ahead to pre-fetch data. If you change this the code
327+ below will need adjustment to compensate. */
328+
329+#define prefetch_lines 5
330+
331+#ifdef USE_VFP
332+ .macro cpy_line_vfp vreg, base
333+ vstr \vreg, [dst, #\base]
334+ vldr \vreg, [src, #\base]
335+ vstr d0, [dst, #\base + 8]
336+ vldr d0, [src, #\base + 8]
337+ vstr d1, [dst, #\base + 16]
338+ vldr d1, [src, #\base + 16]
339+ vstr d2, [dst, #\base + 24]
340+ vldr d2, [src, #\base + 24]
341+ vstr \vreg, [dst, #\base + 32]
342+ vldr \vreg, [src, #\base + prefetch_lines * 64 - 32]
343+ vstr d0, [dst, #\base + 40]
344+ vldr d0, [src, #\base + 40]
345+ vstr d1, [dst, #\base + 48]
346+ vldr d1, [src, #\base + 48]
347+ vstr d2, [dst, #\base + 56]
348+ vldr d2, [src, #\base + 56]
349+ .endm
350+
351+ .macro cpy_tail_vfp vreg, base
352+ vstr \vreg, [dst, #\base]
353+ vldr \vreg, [src, #\base]
354+ vstr d0, [dst, #\base + 8]
355+ vldr d0, [src, #\base + 8]
356+ vstr d1, [dst, #\base + 16]
357+ vldr d1, [src, #\base + 16]
358+ vstr d2, [dst, #\base + 24]
359+ vldr d2, [src, #\base + 24]
360+ vstr \vreg, [dst, #\base + 32]
361+ vstr d0, [dst, #\base + 40]
362+ vldr d0, [src, #\base + 40]
363+ vstr d1, [dst, #\base + 48]
364+ vldr d1, [src, #\base + 48]
365+ vstr d2, [dst, #\base + 56]
366+ vldr d2, [src, #\base + 56]
367+ .endm
368+#endif
369+
370+ .macro def_fn f p2align=0
371 .text
372- .thumb
373-
374-@ ---------------------------------------------------------------------------
375- .thumb_func
376- .align 2
377- .p2align 4,,15
378- .global memcpy
379- .type memcpy,%function
380-memcpy:
381- @ r0 = dest
382- @ r1 = source
383- @ r2 = count
384- @ returns dest in r0
385- @ Overlaps of source/dest not allowed according to spec
386- @ Note this routine relies on v7 misaligned loads/stores
387- pld [r1]
388- mov r12, r0 @ stash original r0
389- cmp r2,#32
390- blt 10f @ take the small copy case separately
391-
392- @ test for either source or destination being misaligned
393- @ (We only rely on word align)
394- @ TODO: Test for co-misalignment
395- tst r0,#3
396- it eq
397- tsteq r1,#3
398- bne 30f @ misaligned case
399-
400-4:
401- @ at this point we are word (or better) aligned and have at least
402- @ 32 bytes to play with
403- push {r3,r4,r5,r6,r7,r8,r10,r11}
404-5:
405- ldmia r1!,{r3,r4,r5,r6,r7,r8,r10,r11}
406- pld [r1,#96]
407- sub r2,r2,#32
408- cmp r2,#32
409- stmia r0!,{r3,r4,r5,r6,r7,r8,r10,r11}
410- bge 5b
411-
412- pop {r3,r4,r5,r6,r7,r8,r10,r11}
413- @ We are now down to less than 32 bytes
414- cbz r2,15f @ quick exit for the case where we copied a multiple of 32
415-
416-10: @ small copies (not necessarily aligned - note might be slightly more than 32bytes)
417- cmp r2,#4
418- blt 12f
419-11:
420- sub r2,r2,#4
421- cmp r2,#4
422- ldr r3, [r1],#4
423- str r3, [r0],#4
424- bge 11b
425-12:
426- tst r2,#2
427- itt ne
428- ldrhne r3, [r1],#2
429- strhne r3, [r0],#2
430-
431- tst r2,#1
432- itt ne
433- ldrbne r3, [r1],#1
434- strbne r3, [r0],#1
435-
436-15: @ exit
437- mov r0,r12 @ restore r0
438- bx lr
439-
440-30: @ non-aligned - at least 32 bytes to play with
441- @ On v7 we're allowed to do ldr's and str's from arbitrary alignments
442- @ but not ldrd/strd or ldm/stm
443- @ Note Neon is often a better choice misaligned using vld1
444-
445- @ copy a byte at a time until the point where we have an aligned destination
446- @ we know we have enough bytes to go to know we won't run out in this phase
447- tst r0,#7
448- beq 35f
449-
450-31:
451- ldrb r3,[r1],#1
452- sub r2,r2,#1
453- strb r3,[r0],#1
454- tst r0,#7
455- bne 31b
456-
457- cmp r2,#32 @ Lets get back to knowing we have 32 bytes to play with
458- blt 11b
459-
460- @ Now the store address is aligned
461-35:
462- push {r3,r4,r5,r6,r7,r8,r10,r11,r12,r14}
463- and r6,r1,#3 @ how misaligned we are
464- cmp r6,#2
465- cbz r6, 100f @ Go there if we're actually aligned
466- bge 120f @ And here if it's aligned on 2 or 3 byte
467- @ Note might be worth splitting to bgt and a separate beq
468- @ if the branches are well separated
469-
470- @ At this point dest is aligned, source is 1 byte forward
471-110:
472- ldr r3,[r1] @ Misaligned load - but it gives the first 4 bytes to store
473- sub r2,r2,#3 @ Number of bytes left in whole words we can load
474- add r1,r1,#3 @ To aligned load address
475- bic r3,r3,#0xff000000
476-
477-112:
478- ldmia r1!,{r5,r6,r7,r8}
479- sub r2,r2,#32
480- cmp r2,#32
481- pld [r1,#96]
482-
483- orr r3,r3,r5,lsl#24
484- mov r4,r5,lsr#8
485- mov r5,r6,lsr#8
486- orr r4,r4,r6,lsl#24
487- mov r6,r7,lsr#8
488- ldmia r1!,{r10,r11,r12,r14}
489- orr r5,r5,r7,lsl#24
490- mov r7,r8,lsr#8
491- orr r6,r6,r8,lsl#24
492- mov r8,r10,lsr#8
493- orr r7,r7,r10,lsl#24
494- mov r10,r11,lsr#8
495- orr r8,r8,r11,lsl#24
496- orr r10,r10,r12,lsl#24
497- mov r11,r12,lsr#8
498- orr r11,r11,r14,lsl#24
499- stmia r0!,{r3,r4,r5,r6,r7,r8,r10,r11}
500- mov r3,r14,lsr#8
501-
502- bge 112b
503-
504- @ Deal with the stragglers
505- add r2,r2,#3
506- sub r1,r1,#3
507- pop {r3,r4,r5,r6,r7,r8,r10,r11,r12,r14}
508- b 10b
509-
510-100: @ Dest and source aligned - must have been originally co-misaligned
511- @ Fallback to main aligned case if still big enough
512- pop {r3,r4,r5,r6,r7,r8,r10,r11,r12,r14}
513- b 4b @ Big copies (32 bytes or more)
514-
515-120: @ Dest is aligned, source is align+2 or 3
516- bgt 130f @ Now split off for 3 byte offset
517-
518- ldrh r3,[r1]
519- sub r2,r2,#2 @ Number of bytes left in whole words we can load
520- add r1,r1,#2 @ To aligned load address
521-
522-122:
523- ldmia r1!,{r5,r6,r7,r8}
524- sub r2,r2,#32
525- cmp r2,#32
526- pld [r1,#96]
527-
528- orr r3,r3,r5,lsl#16
529- mov r4,r5,lsr#16
530- mov r5,r6,lsr#16
531- orr r4,r4,r6,lsl#16
532- mov r6,r7,lsr#16
533- ldmia r1!,{r10,r11,r12,r14}
534- orr r5,r5,r7,lsl#16
535- orr r6,r6,r8,lsl#16
536- mov r7,r8,lsr#16
537- orr r7,r7,r10,lsl#16
538- mov r8,r10,lsr#16
539- orr r8,r8,r11,lsl#16
540- mov r10,r11,lsr#16
541- orr r10,r10,r12,lsl#16
542- mov r11,r12,lsr#16
543- orr r11,r11,r14,lsl#16
544- stmia r0!,{r3,r4,r5,r6,r7,r8,r10,r11}
545- mov r3,r14,lsr#16
546-
547- bge 122b
548-
549- @ Deal with the stragglers
550- add r2,r2,#2
551- sub r1,r1,#2
552- pop {r3,r4,r5,r6,r7,r8,r10,r11,r12,r14}
553- b 10b
554-
555-130: @ Dest is aligned, source is align+3
556- ldrb r3,[r1]
557- sub r2,r2,#1 @ Number of bytes left in whole words we can load
558- add r1,r1,#1 @ To aligned load address
559-
560-132:
561- ldmia r1!,{r5,r6,r7,r8}
562- sub r2,r2,#32
563- cmp r2,#32
564- pld [r1,#96]
565-
566- orr r3,r3,r5,lsl#8
567- mov r4,r5,lsr#24
568- mov r5,r6,lsr#24
569- orr r4,r4,r6,lsl#8
570- mov r6,r7,lsr#24
571- ldmia r1!,{r10,r11,r12,r14}
572- orr r5,r5,r7,lsl#8
573- mov r7,r8,lsr#24
574- orr r6,r6,r8,lsl#8
575- mov r8,r10,lsr#24
576- orr r7,r7,r10,lsl#8
577- orr r8,r8,r11,lsl#8
578- mov r10,r11,lsr#24
579- orr r10,r10,r12,lsl#8
580- mov r11,r12,lsr#24
581- orr r11,r11,r14,lsl#8
582- stmia r0!,{r3,r4,r5,r6,r7,r8,r10,r11}
583- mov r3,r14,lsr#24
584-
585- bge 132b
586-
587- @ Deal with the stragglers
588- add r2,r2,#1
589- sub r1,r1,#1
590- pop {r3,r4,r5,r6,r7,r8,r10,r11,r12,r14}
591- b 10b
592+ .p2align \p2align
593+ .global \f
594+ .type \f, %function
595+\f:
596+ .endm
597+
598+def_fn memcpy p2align=6
599+
600+ mov dst, dstin /* Preserve dstin, we need to return it. */
601+ cmp count, #64
602+ bge .Lcpy_not_short
603+ /* Deal with small copies quickly by dropping straight into the
604+ exit block. */
605+
606+.Ltail63unaligned:
607+#ifdef USE_NEON
608+ and tmp1, count, #0x38
609+ rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
610+ add pc, pc, tmp1
611+ vld1.8 {d0}, [src]! /* 14 words to go. */
612+ vst1.8 {d0}, [dst]!
613+ vld1.8 {d0}, [src]! /* 12 words to go. */
614+ vst1.8 {d0}, [dst]!
615+ vld1.8 {d0}, [src]! /* 10 words to go. */
616+ vst1.8 {d0}, [dst]!
617+ vld1.8 {d0}, [src]! /* 8 words to go. */
618+ vst1.8 {d0}, [dst]!
619+ vld1.8 {d0}, [src]! /* 6 words to go. */
620+ vst1.8 {d0}, [dst]!
621+ vld1.8 {d0}, [src]! /* 4 words to go. */
622+ vst1.8 {d0}, [dst]!
623+ vld1.8 {d0}, [src]! /* 2 words to go. */
624+ vst1.8 {d0}, [dst]!
625+
626+ tst count, #4
627+ ldrne tmp1, [src], #4
628+ strne tmp1, [dst], #4
629+#else
630+ /* Copy up to 15 full words of data. May not be aligned. */
631+ /* Cannot use VFP for unaligned data. */
632+ and tmp1, count, #0x3c
633+ add dst, dst, tmp1
634+ add src, src, tmp1
635+ rsb tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
636+ /* Jump directly into the sequence below at the correct offset. */
637+ add pc, pc, tmp1, lsl #1
638+
639+ ldr tmp1, [src, #-60] /* 15 words to go. */
640+ str tmp1, [dst, #-60]
641+
642+ ldr tmp1, [src, #-56] /* 14 words to go. */
643+ str tmp1, [dst, #-56]
644+ ldr tmp1, [src, #-52]
645+ str tmp1, [dst, #-52]
646+
647+ ldr tmp1, [src, #-48] /* 12 words to go. */
648+ str tmp1, [dst, #-48]
649+ ldr tmp1, [src, #-44]
650+ str tmp1, [dst, #-44]
651+
652+ ldr tmp1, [src, #-40] /* 10 words to go. */
653+ str tmp1, [dst, #-40]
654+ ldr tmp1, [src, #-36]
655+ str tmp1, [dst, #-36]
656+
657+ ldr tmp1, [src, #-32] /* 8 words to go. */
658+ str tmp1, [dst, #-32]
659+ ldr tmp1, [src, #-28]
660+ str tmp1, [dst, #-28]
661+
662+ ldr tmp1, [src, #-24] /* 6 words to go. */
663+ str tmp1, [dst, #-24]
664+ ldr tmp1, [src, #-20]
665+ str tmp1, [dst, #-20]
666+
667+ ldr tmp1, [src, #-16] /* 4 words to go. */
668+ str tmp1, [dst, #-16]
669+ ldr tmp1, [src, #-12]
670+ str tmp1, [dst, #-12]
671+
672+ ldr tmp1, [src, #-8] /* 2 words to go. */
673+ str tmp1, [dst, #-8]
674+ ldr tmp1, [src, #-4]
675+ str tmp1, [dst, #-4]
676+#endif
677+
678+ lsls count, count, #31
679+ ldrhcs tmp1, [src], #2
680+ ldrbne src, [src] /* Src is dead, use as a scratch. */
681+ strhcs tmp1, [dst], #2
682+ strbne src, [dst]
683+ bx lr
684+
685+.Lcpy_not_short:
686+ /* At least 64 bytes to copy, but don't know the alignment yet. */
687+ str tmp2, [sp, #-FRAME_SIZE]!
688+ and tmp2, src, #3
689+ and tmp1, dst, #3
690+ cmp tmp1, tmp2
691+ bne .Lcpy_notaligned
692+
693+#ifdef USE_VFP
694+ /* Magic dust alert! Force VFP on Cortex-A9. Experiments show
695+ that the FP pipeline is much better at streaming loads and
696+ stores. This is outside the critical loop. */
697+ vmov.f32 s0, s0
698+#endif
699+
700+ /* SRC and DST have the same mutual 32-bit alignment, but we may
701+ still need to pre-copy some bytes to get to natural alignment.
702+ We bring DST into full 64-bit alignment. */
703+ lsls tmp2, dst, #29
704+ beq 1f
705+ rsbs tmp2, tmp2, #0
706+ sub count, count, tmp2, lsr #29
707+ ldrmi tmp1, [src], #4
708+ strmi tmp1, [dst], #4
709+ lsls tmp2, tmp2, #2
710+ ldrhcs tmp1, [src], #2
711+ ldrbne tmp2, [src], #1
712+ strhcs tmp1, [dst], #2
713+ strbne tmp2, [dst], #1
714+
715+1:
716+ subs tmp2, count, #64 /* Use tmp2 for count. */
717+ blt .Ltail63aligned
718+
719+ cmp tmp2, #512
720+ bge .Lcpy_body_long
721+
722+.Lcpy_body_medium: /* Count in tmp2. */
723+#ifdef USE_VFP
724+1:
725+ vldr d0, [src, #0]
726+ subs tmp2, tmp2, #64
727+ vldr d1, [src, #8]
728+ vstr d0, [dst, #0]
729+ vldr d0, [src, #16]
730+ vstr d1, [dst, #8]
731+ vldr d1, [src, #24]
732+ vstr d0, [dst, #16]
733+ vldr d0, [src, #32]
734+ vstr d1, [dst, #24]
735+ vldr d1, [src, #40]
736+ vstr d0, [dst, #32]
737+ vldr d0, [src, #48]
738+ vstr d1, [dst, #40]
739+ vldr d1, [src, #56]
740+ vstr d0, [dst, #48]
741+ add src, src, #64
742+ vstr d1, [dst, #56]
743+ add dst, dst, #64
744+ bge 1b
745+ tst tmp2, #0x3f
746+ beq .Ldone
747+
748+.Ltail63aligned: /* Count in tmp2. */
749+ and tmp1, tmp2, #0x38
750+ add dst, dst, tmp1
751+ add src, src, tmp1
752+ rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
753+ add pc, pc, tmp1
754+
755+ vldr d0, [src, #-56] /* 14 words to go. */
756+ vstr d0, [dst, #-56]
757+ vldr d0, [src, #-48] /* 12 words to go. */
758+ vstr d0, [dst, #-48]
759+ vldr d0, [src, #-40] /* 10 words to go. */
760+ vstr d0, [dst, #-40]
761+ vldr d0, [src, #-32] /* 8 words to go. */
762+ vstr d0, [dst, #-32]
763+ vldr d0, [src, #-24] /* 6 words to go. */
764+ vstr d0, [dst, #-24]
765+ vldr d0, [src, #-16] /* 4 words to go. */
766+ vstr d0, [dst, #-16]
767+ vldr d0, [src, #-8] /* 2 words to go. */
768+ vstr d0, [dst, #-8]
769+#else
770+ sub src, src, #8
771+ sub dst, dst, #8
772+1:
773+ ldrd A_l, A_h, [src, #8]
774+ strd A_l, A_h, [dst, #8]
775+ ldrd A_l, A_h, [src, #16]
776+ strd A_l, A_h, [dst, #16]
777+ ldrd A_l, A_h, [src, #24]
778+ strd A_l, A_h, [dst, #24]
779+ ldrd A_l, A_h, [src, #32]
780+ strd A_l, A_h, [dst, #32]
781+ ldrd A_l, A_h, [src, #40]
782+ strd A_l, A_h, [dst, #40]
783+ ldrd A_l, A_h, [src, #48]
784+ strd A_l, A_h, [dst, #48]
785+ ldrd A_l, A_h, [src, #56]
786+ strd A_l, A_h, [dst, #56]
787+ ldrd A_l, A_h, [src, #64]!
788+ strd A_l, A_h, [dst, #64]!
789+ subs tmp2, tmp2, #64
790+ bge 1b
791+ tst tmp2, #0x3f
792+ bne 1f
793+ ldr tmp2,[sp], #FRAME_SIZE
794+ bx lr
795+1:
796+ add src, src, #8
797+ add dst, dst, #8
798+
799+.Ltail63aligned: /* Count in tmp2. */
800+ /* Copy up to 7 d-words of data. Similar to Ltail63unaligned, but
801+ we know that the src and dest are 32-bit aligned so we can use
802+ LDRD/STRD to improve efficiency. */
803+ /* TMP2 is now negative, but we don't care about that. The bottom
804+ six bits still tell us how many bytes are left to copy. */
805+
806+ and tmp1, tmp2, #0x38
807+ add dst, dst, tmp1
808+ add src, src, tmp1
809+ rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
810+ add pc, pc, tmp1
811+ ldrd A_l, A_h, [src, #-56] /* 14 words to go. */
812+ strd A_l, A_h, [dst, #-56]
813+ ldrd A_l, A_h, [src, #-48] /* 12 words to go. */
814+ strd A_l, A_h, [dst, #-48]
815+ ldrd A_l, A_h, [src, #-40] /* 10 words to go. */
816+ strd A_l, A_h, [dst, #-40]
817+ ldrd A_l, A_h, [src, #-32] /* 8 words to go. */
818+ strd A_l, A_h, [dst, #-32]
819+ ldrd A_l, A_h, [src, #-24] /* 6 words to go. */
820+ strd A_l, A_h, [dst, #-24]
821+ ldrd A_l, A_h, [src, #-16] /* 4 words to go. */
822+ strd A_l, A_h, [dst, #-16]
823+ ldrd A_l, A_h, [src, #-8] /* 2 words to go. */
824+ strd A_l, A_h, [dst, #-8]
825+
826+#endif
827+ tst tmp2, #4
828+ ldrne tmp1, [src], #4
829+ strne tmp1, [dst], #4
830+ lsls tmp2, tmp2, #31 /* Count (tmp2) now dead. */
831+ ldrhcs tmp1, [src], #2
832+ ldrbne tmp2, [src]
833+ strhcs tmp1, [dst], #2
834+ strbne tmp2, [dst]
835+
836+.Ldone:
837+ ldr tmp2, [sp], #FRAME_SIZE
838+ bx lr
839+
840+.Lcpy_body_long: /* Count in tmp2. */
841+
842+ /* Long copy. We know that there's at least (prefetch_lines * 64)
843+ bytes to go. */
844+#ifdef USE_VFP
845+ /* Don't use PLD. Instead, read some data in advance of the current
846+ copy position into a register. This should act like a PLD
847+ operation but we won't have to repeat the transfer. */
848+
849+ vldr d3, [src, #0]
850+ vldr d4, [src, #64]
851+ vldr d5, [src, #128]
852+ vldr d6, [src, #192]
853+ vldr d7, [src, #256]
854+
855+ vldr d0, [src, #8]
856+ vldr d1, [src, #16]
857+ vldr d2, [src, #24]
858+ add src, src, #32
859+
860+ subs tmp2, tmp2, #prefetch_lines * 64 * 2
861+ blt 2f
862+1:
863+ cpy_line_vfp d3, 0
864+ cpy_line_vfp d4, 64
865+ cpy_line_vfp d5, 128
866+ add dst, dst, #3 * 64
867+ add src, src, #3 * 64
868+ cpy_line_vfp d6, 0
869+ cpy_line_vfp d7, 64
870+ add dst, dst, #2 * 64
871+ add src, src, #2 * 64
872+ subs tmp2, tmp2, #prefetch_lines * 64
873+ bge 1b
874+
875+2:
876+ cpy_tail_vfp d3, 0
877+ cpy_tail_vfp d4, 64
878+ cpy_tail_vfp d5, 128
879+ add src, src, #3 * 64
880+ add dst, dst, #3 * 64
881+ cpy_tail_vfp d6, 0
882+ vstr d7, [dst, #64]
883+ vldr d7, [src, #64]
884+ vstr d0, [dst, #64 + 8]
885+ vldr d0, [src, #64 + 8]
886+ vstr d1, [dst, #64 + 16]
887+ vldr d1, [src, #64 + 16]
888+ vstr d2, [dst, #64 + 24]
889+ vldr d2, [src, #64 + 24]
890+ vstr d7, [dst, #64 + 32]
891+ add src, src, #96
892+ vstr d0, [dst, #64 + 40]
893+ vstr d1, [dst, #64 + 48]
894+ vstr d2, [dst, #64 + 56]
895+ add dst, dst, #128
896+ add tmp2, tmp2, #prefetch_lines * 64
897+ b .Lcpy_body_medium
898+#else
899+ /* Long copy. Use an SMS style loop to maximize the I/O
900+ bandwidth of the core. We don't have enough spare registers
901+ to synthesise prefetching, so use PLD operations. */
902+ /* Pre-bias src and dst. */
903+ sub src, src, #8
904+ sub dst, dst, #8
905+ pld [src, #8]
906+ pld [src, #72]
907+ subs tmp2, tmp2, #64
908+ pld [src, #136]
909+ ldrd A_l, A_h, [src, #8]
910+ strd B_l, B_h, [sp, #8]
911+ ldrd B_l, B_h, [src, #16]
912+ strd C_l, C_h, [sp, #16]
913+ ldrd C_l, C_h, [src, #24]
914+ strd D_l, D_h, [sp, #24]
915+ pld [src, #200]
916+ ldrd D_l, D_h, [src, #32]!
917+ b 1f
918+ .p2align 6
919+2:
920+ pld [src, #232]
921+ strd A_l, A_h, [dst, #40]
922+ ldrd A_l, A_h, [src, #40]
923+ strd B_l, B_h, [dst, #48]
924+ ldrd B_l, B_h, [src, #48]
925+ strd C_l, C_h, [dst, #56]
926+ ldrd C_l, C_h, [src, #56]
927+ strd D_l, D_h, [dst, #64]!
928+ ldrd D_l, D_h, [src, #64]!
929+ subs tmp2, tmp2, #64
930+1:
931+ strd A_l, A_h, [dst, #8]
932+ ldrd A_l, A_h, [src, #8]
933+ strd B_l, B_h, [dst, #16]
934+ ldrd B_l, B_h, [src, #16]
935+ strd C_l, C_h, [dst, #24]
936+ ldrd C_l, C_h, [src, #24]
937+ strd D_l, D_h, [dst, #32]
938+ ldrd D_l, D_h, [src, #32]
939+ bcs 2b
940+ /* Save the remaining bytes and restore the callee-saved regs. */
941+ strd A_l, A_h, [dst, #40]
942+ add src, src, #40
943+ strd B_l, B_h, [dst, #48]
944+ ldrd B_l, B_h, [sp, #8]
945+ strd C_l, C_h, [dst, #56]
946+ ldrd C_l, C_h, [sp, #16]
947+ strd D_l, D_h, [dst, #64]
948+ ldrd D_l, D_h, [sp, #24]
949+ add dst, dst, #72
950+ tst tmp2, #0x3f
951+ bne .Ltail63aligned
952+ ldr tmp2, [sp], #FRAME_SIZE
953+ bx lr
954+#endif
955+
956+.Lcpy_notaligned:
957+ pld [src]
958+ pld [src, #64]
959+ /* There's at least 64 bytes to copy, but there is no mutual
960+ alignment. */
961+ /* Bring DST to 64-bit alignment. */
962+ lsls tmp2, dst, #29
963+ pld [src, #(2 * 64)]
964+ beq 1f
965+ rsbs tmp2, tmp2, #0
966+ sub count, count, tmp2, lsr #29
967+ ldrmi tmp1, [src], #4
968+ strmi tmp1, [dst], #4
969+ lsls tmp2, tmp2, #2
970+ ldrbne tmp1, [src], #1
971+ ldrhcs tmp2, [src], #2
972+ strbne tmp1, [dst], #1
973+ strhcs tmp2, [dst], #2
974+1:
975+ pld [src, #(3 * 64)]
976+ subs count, count, #64
977+ ldrmi tmp2, [sp], #FRAME_SIZE
978+ bmi .Ltail63unaligned
979+ pld [src, #(4 * 64)]
980+
981+#ifdef USE_NEON
982+ vld1.8 {d0-d3}, [src]!
983+ vld1.8 {d4-d7}, [src]!
984+ subs count, count, #64
985+ bmi 2f
986+1:
987+ pld [src, #(4 * 64)]
988+ vst1.8 {d0-d3}, [ALIGN (dst, 64)]!
989+ vld1.8 {d0-d3}, [src]!
990+ vst1.8 {d4-d7}, [ALIGN (dst, 64)]!
991+ vld1.8 {d4-d7}, [src]!
992+ subs count, count, #64
993+ bpl 1b
994+2:
995+ vst1.8 {d0-d3}, [ALIGN (dst, 64)]!
996+ vst1.8 {d4-d7}, [ALIGN (dst, 64)]!
997+ ands count, count, #0x3f
998+#else
999+ /* Use an SMS style loop to maximize the I/O bandwidth. */
1000+ sub src, src, #4
1001+ sub dst, dst, #8
1002+ subs tmp2, count, #64 /* Use tmp2 for count. */
1003+ ldr A_l, [src, #4]
1004+ ldr A_h, [src, #8]
1005+ strd B_l, B_h, [sp, #8]
1006+ ldr B_l, [src, #12]
1007+ ldr B_h, [src, #16]
1008+ strd C_l, C_h, [sp, #16]
1009+ ldr C_l, [src, #20]
1010+ ldr C_h, [src, #24]
1011+ strd D_l, D_h, [sp, #24]
1012+ ldr D_l, [src, #28]
1013+ ldr D_h, [src, #32]!
1014+ b 1f
1015+ .p2align 6
1016+2:
1017+ pld [src, #(5 * 64) - (32 - 4)]
1018+ strd A_l, A_h, [dst, #40]
1019+ ldr A_l, [src, #36]
1020+ ldr A_h, [src, #40]
1021+ strd B_l, B_h, [dst, #48]
1022+ ldr B_l, [src, #44]
1023+ ldr B_h, [src, #48]
1024+ strd C_l, C_h, [dst, #56]
1025+ ldr C_l, [src, #52]
1026+ ldr C_h, [src, #56]
1027+ strd D_l, D_h, [dst, #64]!
1028+ ldr D_l, [src, #60]
1029+ ldr D_h, [src, #64]!
1030+ subs tmp2, tmp2, #64
1031+1:
1032+ strd A_l, A_h, [dst, #8]
1033+ ldr A_l, [src, #4]
1034+ ldr A_h, [src, #8]
1035+ strd B_l, B_h, [dst, #16]
1036+ ldr B_l, [src, #12]
1037+ ldr B_h, [src, #16]
1038+ strd C_l, C_h, [dst, #24]
1039+ ldr C_l, [src, #20]
1040+ ldr C_h, [src, #24]
1041+ strd D_l, D_h, [dst, #32]
1042+ ldr D_l, [src, #28]
1043+ ldr D_h, [src, #32]
1044+ bcs 2b
1045+
1046+ /* Save the remaining bytes and restore the callee-saved regs. */
1047+ strd A_l, A_h, [dst, #40]
1048+ add src, src, #36
1049+ strd B_l, B_h, [dst, #48]
1050+ ldrd B_l, B_h, [sp, #8]
1051+ strd C_l, C_h, [dst, #56]
1052+ ldrd C_l, C_h, [sp, #16]
1053+ strd D_l, D_h, [dst, #64]
1054+ ldrd D_l, D_h, [sp, #24]
1055+ add dst, dst, #72
1056+ ands count, tmp2, #0x3f
1057+#endif
1058+ ldr tmp2, [sp], #FRAME_SIZE
1059+ bne .Ltail63unaligned
1060+ bx lr
1061+
1062+ .size memcpy, . - memcpy
