Merge lp:~matthew-gretton-dann/cortex-strings/aarch64-additions-2 into lp:cortex-strings

Proposed by Matthew Gretton-Dann
Status: Merged
Merged at revision: 97
Proposed branch: lp:~matthew-gretton-dann/cortex-strings/aarch64-additions-2
Merge into: lp:cortex-strings
Diff against target: 569 lines (+530/-2)
4 files modified
Makefile.am (+5/-2)
src/aarch64/memcmp.S (+162/-0)
src/aarch64/strnlen.S (+164/-0)
tests/test-strnlen.c (+199/-0)
To merge this branch: bzr merge lp:~matthew-gretton-dann/cortex-strings/aarch64-additions-2
Reviewer Review Type Date Requested Status
Linaro Toolchain Developers Pending
Review via email: mp+142147@code.launchpad.net
To post a comment you must log in.

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'Makefile.am'
2--- Makefile.am 2013-01-07 14:12:23 +0000
3+++ Makefile.am 2013-01-07 16:09:31 +0000
4@@ -44,7 +44,8 @@
5 tests/test-strcmp \
6 tests/test-strcpy \
7 tests/test-strlen \
8- tests/test-strncmp
9+ tests/test-strncmp \
10+ tests/test-strnlen
11
12 # Options for the tests
13 tests_cflags = -I$(srcdir)/tests $(AM_CFLAGS)
14@@ -266,12 +267,14 @@
15 if HOST_AARCH64
16
17 libcortex_strings_la_SOURCES = \
18+ src/aarch64/memcmp.S \
19 src/aarch64/memcpy.S \
20 src/aarch64/memmove.S \
21 src/aarch64/memset.S \
22 src/aarch64/strcmp.S \
23 src/aarch64/strlen.S \
24- src/aarch64/strncmp.S
25+ src/aarch64/strncmp.S \
26+ src/aarch64/strnlen.S
27
28 endif
29
30
31=== added file 'src/aarch64/memcmp.S'
32--- src/aarch64/memcmp.S 1970-01-01 00:00:00 +0000
33+++ src/aarch64/memcmp.S 2013-01-07 16:09:31 +0000
34@@ -0,0 +1,162 @@
35+/* memcmp - compare memory
36+
37+ Copyright (c) 2013, Linaro Limited
38+ All rights reserved.
39+
40+ Redistribution and use in source and binary forms, with or without
41+ modification, are permitted provided that the following conditions are met:
42+ * Redistributions of source code must retain the above copyright
43+ notice, this list of conditions and the following disclaimer.
44+ * Redistributions in binary form must reproduce the above copyright
45+ notice, this list of conditions and the following disclaimer in the
46+ documentation and/or other materials provided with the distribution.
47+ * Neither the name of the Linaro nor the
48+ names of its contributors may be used to endorse or promote products
49+ derived from this software without specific prior written permission.
50+
51+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
52+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
53+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
54+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
55+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
56+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
57+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
58+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
59+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
60+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
61+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
62+
63+/* Assumptions:
64+ *
65+ * ARMv8-a, AArch64
66+ */
67+
68+ .macro def_fn f p2align=0
69+ .text
70+ .p2align \p2align
71+ .global \f
72+ .type \f, %function
73+\f:
74+ .endm
75+
76+/* Parameters and result. */
77+#define src1 x0
78+#define src2 x1
79+#define limit x2
80+#define result x0
81+
82+/* Internal variables. */
83+#define data1 x3
84+#define data1w w3
85+#define data2 x4
86+#define data2w w4
87+#define has_nul x5
88+#define diff x6
89+#define endloop x7
90+#define tmp1 x8
91+#define tmp2 x9
92+#define tmp3 x10
93+#define pos x11
94+#define limit_wd x12
95+#define mask x13
96+
97+def_fn memcmp p2align=6
98+ cbz limit, .Lret0
99+ eor tmp1, src1, src2
100+ tst tmp1, #7
101+ b.ne .Lmisaligned8
102+ ands tmp1, src1, #7
103+ b.ne .Lmutual_align
104+ add limit_wd, limit, #7
105+ lsr limit_wd, limit_wd, #3
106+ /* Start of performance-critical section -- one 64B cache line. */
107+.Lloop_aligned:
108+ ldr data1, [src1], #8
109+ ldr data2, [src2], #8
110+.Lstart_realigned:
111+ subs limit_wd, limit_wd, #1
112+ eor diff, data1, data2 /* Non-zero if differences found. */
113+ csinv endloop, diff, xzr, ne /* Last Dword or differences. */
114+ cbz endloop, .Lloop_aligned
115+ /* End of performance-critical section -- one 64B cache line. */
116+
117+ /* Not reached the limit, must have found a diff. */
118+ cbnz limit_wd, .Lnot_limit
119+
120+ /* Limit % 8 == 0 => all bytes significant. */
121+ ands limit, limit, #7
122+ b.eq .Lnot_limit
123+
124+ lsl limit, limit, #3 /* Bits -> bytes. */
125+ mov mask, #~0
126+#ifdef __AARCH64EB__
127+ lsr mask, mask, limit
128+#else
129+ lsl mask, mask, limit
130+#endif
131+ bic data1, data1, mask
132+ bic data2, data2, mask
133+
134+ orr diff, diff, mask
135+.Lnot_limit:
136+
137+#ifndef __AARCH64EB__
138+ rev diff, diff
139+ rev data1, data1
140+ rev data2, data2
141+#endif
142+ /* The MS-non-zero bit of DIFF marks either the first bit
143+ that is different, or the end of the significant data.
144+ Shifting left now will bring the critical information into the
145+ top bits. */
146+ clz pos, diff
147+ lsl data1, data1, pos
148+ lsl data2, data2, pos
149+ /* But we need to zero-extend (char is unsigned) the value and then
150+ perform a signed 32-bit subtraction. */
151+ lsr data1, data1, #56
152+ sub result, data1, data2, lsr #56
153+ ret
154+
155+.Lmutual_align:
156+ /* Sources are mutually aligned, but are not currently at an
157+ alignment boundary. Round down the addresses and then mask off
158+ the bytes that precede the start point. */
159+ bic src1, src1, #7
160+ bic src2, src2, #7
161+ add limit, limit, tmp1 /* Adjust the limit for the extra. */
162+ lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
163+ ldr data1, [src1], #8
164+ neg tmp1, tmp1 /* Bits to alignment -64. */
165+ ldr data2, [src2], #8
166+ mov tmp2, #~0
167+#ifdef __AARCH64EB__
168+ /* Big-endian. Early bytes are at MSB. */
169+ lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
170+#else
171+ /* Little-endian. Early bytes are at LSB. */
172+ lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
173+#endif
174+ add limit_wd, limit, #7
175+ orr data1, data1, tmp2
176+ orr data2, data2, tmp2
177+ lsr limit_wd, limit_wd, #3
178+ b .Lstart_realigned
179+
180+.Lret0:
181+ mov result, #0
182+ ret
183+
184+ .p2align 6
185+.Lmisaligned8:
186+ sub limit, limit, #1
187+1:
188+ /* Perhaps we can do better than this. */
189+ ldrb data1w, [src1], #1
190+ ldrb data2w, [src2], #1
191+ subs limit, limit, #1
192+ ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
193+ b.eq 1b
194+ sub result, data1, data2
195+ ret
196+ .size memcmp, . - memcmp
197
198=== added file 'src/aarch64/strnlen.S'
199--- src/aarch64/strnlen.S 1970-01-01 00:00:00 +0000
200+++ src/aarch64/strnlen.S 2013-01-07 16:09:31 +0000
201@@ -0,0 +1,164 @@
202+/* strnlen - calculate the length of a string with limit.
203+
204+ Copyright (c) 2013, Linaro Limited
205+ All rights reserved.
206+
207+ Redistribution and use in source and binary forms, with or without
208+ modification, are permitted provided that the following conditions are met:
209+ * Redistributions of source code must retain the above copyright
210+ notice, this list of conditions and the following disclaimer.
211+ * Redistributions in binary form must reproduce the above copyright
212+ notice, this list of conditions and the following disclaimer in the
213+ documentation and/or other materials provided with the distribution.
214+ * Neither the name of the Linaro nor the
215+ names of its contributors may be used to endorse or promote products
216+ derived from this software without specific prior written permission.
217+
218+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
219+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
220+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
221+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
222+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
223+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
224+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
225+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
226+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
227+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
228+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
229+
230+/* Assumptions:
231+ *
232+ * ARMv8-a, AArch64
233+ */
234+
235+/* Arguments and results. */
236+#define srcin x0
237+#define len x0
238+#define limit x1
239+
240+/* Locals and temporaries. */
241+#define src x2
242+#define data1 x3
243+#define data2 x4
244+#define data2a x5
245+#define has_nul1 x6
246+#define has_nul2 x7
247+#define tmp1 x8
248+#define tmp2 x9
249+#define tmp3 x10
250+#define tmp4 x11
251+#define zeroones x12
252+#define pos x13
253+#define limit_wd x14
254+
255+ .macro def_fn f p2align=0
256+ .text
257+ .p2align \p2align
258+ .global \f
259+ .type \f, %function
260+\f:
261+ .endm
262+
263+#define REP8_01 0x0101010101010101
264+#define REP8_7f 0x7f7f7f7f7f7f7f7f
265+#define REP8_80 0x8080808080808080
266+
267+ .text
268+ .p2align 6
269+.Lstart:
270+ /* Pre-pad to ensure critical loop begins an icache line. */
271+ .rep 7
272+ nop
273+ .endr
274+ /* Put this code here to avoid wasting more space with pre-padding. */
275+.Lhit_limit:
276+ mov len, limit
277+ ret
278+
279+def_fn strnlen
280+ cbz limit, .Lhit_limit
281+ mov zeroones, #REP8_01
282+ bic src, srcin, #15
283+ ands tmp1, srcin, #15
284+ b.ne .Lmisaligned
285+ add limit_wd, limit, #15
286+ lsr limit_wd, limit_wd, #4
287+ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
288+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
289+ can be done in parallel across the entire word. */
290+ /* The inner loop deals with two Dwords at a time. This has a
291+ slightly higher start-up cost, but we should win quite quickly,
292+ especially on cores with a high number of issue slots per
293+ cycle, as we get much better parallelism out of the operations. */
294+
295+ /* Start of critial section -- keep to one 64Byte cache line. */
296+.Lloop:
297+ ldp data1, data2, [src], #16
298+.Lrealigned:
299+ sub tmp1, data1, zeroones
300+ orr tmp2, data1, #REP8_7f
301+ sub tmp3, data2, zeroones
302+ orr tmp4, data2, #REP8_7f
303+ bic has_nul1, tmp1, tmp2
304+ bic has_nul2, tmp3, tmp4
305+ subs limit_wd, limit_wd, #1
306+ orr tmp1, has_nul1, has_nul2
307+ ccmp tmp1, #0, #0, ne /* NZCV = 0000 */
308+ b.eq .Lloop
309+ /* End of critical section -- keep to one 64Byte cache line. */
310+
311+ orr tmp1, has_nul1, has_nul2
312+ cbz tmp1, .Lhit_limit /* No null in final Qword. */
313+
314+ /* We know there's a null in the final Qword. The easiest thing
315+ to do now is work out the length of the string and return
316+ MIN (len, limit). */
317+
318+ sub len, src, srcin
319+ cbz has_nul1, .Lnul_in_data2
320+#ifdef __AARCH64EB__
321+ mov data2, data1
322+#endif
323+ sub len, len, #8
324+ mov has_nul2, has_nul1
325+.Lnul_in_data2:
326+#ifdef __AARCH64EB__
327+ /* For big-endian, carry propagation (if the final byte in the
328+ string is 0x01) means we cannot use has_nul directly. The
329+ easiest way to get the correct byte is to byte-swap the data
330+ and calculate the syndrome a second time. */
331+ rev data2, data2
332+ sub tmp1, data2, zeroones
333+ orr tmp2, data2, #REP8_7f
334+ bic has_nul2, tmp1, tmp2
335+#endif
336+ sub len, len, #8
337+ rev has_nul2, has_nul2
338+ clz pos, has_nul2
339+ add len, len, pos, lsr #3 /* Bits to bytes. */
340+ cmp len, limit
341+ csel len, len, limit, ls /* Return the lower value. */
342+ ret
343+
344+.Lmisaligned:
345+ add tmp3, limit, tmp1
346+ cmp tmp1, #8
347+ neg tmp1, tmp1
348+ ldp data1, data2, [src], #16
349+ add limit_wd, tmp3, #15
350+ lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
351+ mov tmp2, #~0
352+ lsr limit_wd, limit_wd, #4
353+#ifdef __AARCH64EB__
354+ /* Big-endian. Early bytes are at MSB. */
355+ lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
356+#else
357+ /* Little-endian. Early bytes are at LSB. */
358+ lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
359+#endif
360+ orr data1, data1, tmp2
361+ orr data2a, data2, tmp2
362+ csinv data1, data1, xzr, le
363+ csel data2, data2, data2a, le
364+ b .Lrealigned
365+ .size strnlen, . - .Lstart /* Include pre-padding in size. */
366
367=== added file 'tests/test-strnlen.c'
368--- tests/test-strnlen.c 1970-01-01 00:00:00 +0000
369+++ tests/test-strnlen.c 2013-01-07 16:09:31 +0000
370@@ -0,0 +1,199 @@
371+/* Test and measure strlen functions.
372+ Copyright (C) 1999-2012 Free Software Foundation, Inc.
373+ This file is part of the GNU C Library.
374+ Written by Jakub Jelinek <jakub@redhat.com>, 1999.
375+
376+ The GNU C Library is free software; you can redistribute it and/or
377+ modify it under the terms of the GNU Lesser General Public
378+ License as published by the Free Software Foundation; either
379+ version 2.1 of the License, or (at your option) any later version.
380+
381+ The GNU C Library is distributed in the hope that it will be useful,
382+ but WITHOUT ANY WARRANTY; without even the implied warranty of
383+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
384+ Lesser General Public License for more details.
385+
386+ You should have received a copy of the GNU Lesser General Public
387+ License along with the GNU C Library; if not, see
388+ <http://www.gnu.org/licenses/>. */
389+
390+#define TEST_MAIN
391+#define TEST_NAME "strnlen"
392+#include "test-string.h"
393+
394+#define MIN(a,b) ((a) < (b) ? (a) : (b))
395+
396+typedef size_t (*proto_t) (const char *, size_t);
397+size_t simple_strnlen (const char *, size_t);
398+
399+IMPL (simple_strnlen, 0)
400+IMPL (strnlen, 1)
401+
402+size_t
403+simple_strnlen (const char *s, size_t maxlen)
404+{
405+ size_t i;
406+
407+ for (i = 0; i < maxlen && s[i]; ++i);
408+ return i;
409+}
410+
411+static void
412+do_one_test (impl_t *impl, const char *s, size_t maxlen, size_t exp_len)
413+{
414+ size_t len = CALL (impl, s, maxlen);
415+ if (len != exp_len)
416+ {
417+ error (0, 0, "Wrong result in function %s %zd %zd", impl->name,
418+ len, exp_len);
419+ ret = 1;
420+ return;
421+ }
422+
423+ if (HP_TIMING_AVAIL)
424+ {
425+ hp_timing_t start __attribute ((unused));
426+ hp_timing_t stop __attribute ((unused));
427+ hp_timing_t best_time = ~ (hp_timing_t) 0;
428+ size_t i;
429+
430+ for (i = 0; i < 32; ++i)
431+ {
432+ HP_TIMING_NOW (start);
433+ CALL (impl, s, maxlen);
434+ HP_TIMING_NOW (stop);
435+ HP_TIMING_BEST (best_time, start, stop);
436+ }
437+
438+ printf ("\t%zd", (size_t) best_time);
439+ }
440+}
441+
442+static void
443+do_test (size_t align, size_t len, size_t maxlen, int max_char)
444+{
445+ size_t i;
446+
447+ align &= 7;
448+ if (align + len >= page_size)
449+ return;
450+
451+ for (i = 0; i < len; ++i)
452+ buf1[align + i] = 1 + 7 * i % max_char;
453+ buf1[align + len] = 0;
454+
455+ if (HP_TIMING_AVAIL)
456+ printf ("Length %4zd, alignment %2zd:", len, align);
457+
458+ FOR_EACH_IMPL (impl, 0)
459+ do_one_test (impl, (char *) (buf1 + align), maxlen, MIN (len, maxlen));
460+
461+ if (HP_TIMING_AVAIL)
462+ putchar ('\n');
463+}
464+
465+static void
466+do_random_tests (void)
467+{
468+ size_t i, j, n, align, len;
469+ unsigned char *p = buf1 + page_size - 512;
470+
471+ for (n = 0; n < ITERATIONS; n++)
472+ {
473+ align = random () & 15;
474+ len = random () & 511;
475+ if (len + align > 510)
476+ len = 511 - align - (random () & 7);
477+ j = len + align + 64;
478+ if (j > 512)
479+ j = 512;
480+
481+ for (i = 0; i < j; i++)
482+ {
483+ if (i == len + align)
484+ p[i] = 0;
485+ else
486+ {
487+ p[i] = random () & 255;
488+ if (i >= align && i < len + align && !p[i])
489+ p[i] = (random () & 127) + 1;
490+ }
491+ }
492+
493+ FOR_EACH_IMPL (impl, 1)
494+ {
495+ if (len > 0
496+ && CALL (impl, (char *) (p + align), len - 1) != len - 1)
497+ {
498+ error (0, 0, "Iteration %zd (limited) - wrong result in function %s (%zd) %zd != %zd, p %p",
499+ n, impl->name, align,
500+ CALL (impl, (char *) (p + align), len - 1), len - 1, p);
501+ ret = 1;
502+ }
503+ if (CALL (impl, (char *) (p + align), len) != len)
504+ {
505+ error (0, 0, "Iteration %zd (exact) - wrong result in function %s (%zd) %zd != %zd, p %p",
506+ n, impl->name, align,
507+ CALL (impl, (char *) (p + align), len), len, p);
508+ ret = 1;
509+ }
510+ if (CALL (impl, (char *) (p + align), len + 1) != len)
511+ {
512+ error (0, 0, "Iteration %zd (long) - wrong result in function %s (%zd) %zd != %zd, p %p",
513+ n, impl->name, align,
514+ CALL (impl, (char *) (p + align), len + 1), len, p);
515+ ret = 1;
516+ }
517+ }
518+ }
519+}
520+
521+int
522+test_main (void)
523+{
524+ size_t i;
525+
526+ test_init ();
527+
528+ printf ("%20s", "");
529+ FOR_EACH_IMPL (impl, 0)
530+ printf ("\t%s", impl->name);
531+ putchar ('\n');
532+
533+ for (i = 1; i < 8; ++i)
534+ {
535+ do_test (0, i, i - 1, 127);
536+ do_test (0, i, i, 127);
537+ do_test (0, i, i + 1, 127);
538+ }
539+
540+ for (i = 1; i < 8; ++i)
541+ {
542+ do_test (i, i, i - 1, 127);
543+ do_test (i, i, i, 127);
544+ do_test (i, i, i + 1, 127);
545+ }
546+
547+ for (i = 2; i <= 10; ++i)
548+ {
549+ do_test (0, 1 << i, 5000, 127);
550+ do_test (1, 1 << i, 5000, 127);
551+ }
552+
553+ for (i = 1; i < 8; ++i)
554+ do_test (0, i, 5000, 255);
555+
556+ for (i = 1; i < 8; ++i)
557+ do_test (i, i, 5000, 255);
558+
559+ for (i = 2; i <= 10; ++i)
560+ {
561+ do_test (0, 1 << i, 5000, 255);
562+ do_test (1, 1 << i, 5000, 255);
563+ }
564+
565+ do_random_tests ();
566+ return ret;
567+}
568+
569+#include "test-skeleton.c"

Subscribers

People subscribed via source and target branches