Merge lp:~matthew-gretton-dann/cortex-strings/aarch64-additions-2 into lp:cortex-strings

Proposed by Matthew Gretton-Dann
Status: Merged
Merged at revision: 97
Proposed branch: lp:~matthew-gretton-dann/cortex-strings/aarch64-additions-2
Merge into: lp:cortex-strings
Diff against target: 569 lines (+530/-2)
4 files modified
Makefile.am (+5/-2)
src/aarch64/memcmp.S (+162/-0)
src/aarch64/strnlen.S (+164/-0)
tests/test-strnlen.c (+199/-0)
To merge this branch: bzr merge lp:~matthew-gretton-dann/cortex-strings/aarch64-additions-2
Reviewer Review Type Date Requested Status
Linaro Toolchain Developers Pending
Review via email: mp+142147@code.launchpad.net
To post a comment you must log in.

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
=== modified file 'Makefile.am'
--- Makefile.am 2013-01-07 14:12:23 +0000
+++ Makefile.am 2013-01-07 16:09:31 +0000
@@ -44,7 +44,8 @@
44 tests/test-strcmp \44 tests/test-strcmp \
45 tests/test-strcpy \45 tests/test-strcpy \
46 tests/test-strlen \46 tests/test-strlen \
47 tests/test-strncmp47 tests/test-strncmp \
48 tests/test-strnlen
4849
49# Options for the tests50# Options for the tests
50tests_cflags = -I$(srcdir)/tests $(AM_CFLAGS)51tests_cflags = -I$(srcdir)/tests $(AM_CFLAGS)
@@ -266,12 +267,14 @@
266if HOST_AARCH64267if HOST_AARCH64
267268
268libcortex_strings_la_SOURCES = \269libcortex_strings_la_SOURCES = \
270 src/aarch64/memcmp.S \
269 src/aarch64/memcpy.S \271 src/aarch64/memcpy.S \
270 src/aarch64/memmove.S \272 src/aarch64/memmove.S \
271 src/aarch64/memset.S \273 src/aarch64/memset.S \
272 src/aarch64/strcmp.S \274 src/aarch64/strcmp.S \
273 src/aarch64/strlen.S \275 src/aarch64/strlen.S \
274 src/aarch64/strncmp.S276 src/aarch64/strncmp.S \
277 src/aarch64/strnlen.S
275278
276endif279endif
277280
278281
=== added file 'src/aarch64/memcmp.S'
--- src/aarch64/memcmp.S 1970-01-01 00:00:00 +0000
+++ src/aarch64/memcmp.S 2013-01-07 16:09:31 +0000
@@ -0,0 +1,162 @@
1/* memcmp - compare memory
2
3 Copyright (c) 2013, Linaro Limited
4 All rights reserved.
5
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 * Redistributions of source code must retain the above copyright
9 notice, this list of conditions and the following disclaimer.
10 * Redistributions in binary form must reproduce the above copyright
11 notice, this list of conditions and the following disclaimer in the
12 documentation and/or other materials provided with the distribution.
13 * Neither the name of the Linaro nor the
14 names of its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16
17 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
28
29/* Assumptions:
30 *
31 * ARMv8-a, AArch64
32 */
33
/* def_fn NAME [p2align=N]: switch to .text, align to 2^N bytes, declare
   NAME as a global function symbol and emit its label. */
34 .macro def_fn f p2align=0
35 .text
36 .p2align \p2align
37 .global \f
38 .type \f, %function
39\f:
40 .endm
41
42/* Parameters and result. */
43#define src1 x0
44#define src2 x1
45#define limit x2
46#define result x0
47
48/* Internal variables. */
/* Note: has_nul and tmp3 are defined here but not referenced by the
   code below. */
49#define data1 x3
50#define data1w w3
51#define data2 x4
52#define data2w w4
53#define has_nul x5
54#define diff x6
55#define endloop x7
56#define tmp1 x8
57#define tmp2 x9
58#define tmp3 x10
59#define pos x11
60#define limit_wd x12
61#define mask x13
62
/* memcmp (src1 = x0, src2 = x1, limit = x2) -> result in x0.
   Returns 0 when limit is 0. Otherwise compares up to LIMIT bytes and
   returns the difference of the first pair of bytes that differ (each
   zero-extended), or 0 if none differ.
   Three paths:
     - both pointers 8-byte aligned: Dword loop at .Lloop_aligned;
     - same misalignment within a Dword: .Lmutual_align, then the
       Dword loop;
     - different alignment: byte-at-a-time loop at .Lmisaligned8. */
63def_fn memcmp p2align=6
64 cbz limit, .Lret0
65 eor tmp1, src1, src2
66 tst tmp1, #7
67 b.ne .Lmisaligned8
68 ands tmp1, src1, #7
69 b.ne .Lmutual_align
 /* limit_wd = number of Dwords needed to cover LIMIT bytes, rounded up. */
70 add limit_wd, limit, #7
71 lsr limit_wd, limit_wd, #3
72 /* Start of performance-critical section -- one 64B cache line. */
73.Lloop_aligned:
74 ldr data1, [src1], #8
75 ldr data2, [src2], #8
76.Lstart_realigned:
77 subs limit_wd, limit_wd, #1
78 eor diff, data1, data2 /* Non-zero if differences found. */
 /* endloop = diff while Dwords remain (ne), else all-ones, so the loop
    exits on either a difference or the final Dword. */
79 csinv endloop, diff, xzr, ne /* Last Dword or differences. */
80 cbz endloop, .Lloop_aligned
81 /* End of performance-critical section -- one 64B cache line. */
82
83 /* Not reached the limit, must have found a diff. */
84 cbnz limit_wd, .Lnot_limit
85
86 /* Limit % 8 == 0 => all bytes significant. */
87 ands limit, limit, #7
88 b.eq .Lnot_limit
89
 /* Build a mask covering the bytes of the final Dword that lie beyond
    the limit, clear them in both data words and set them in DIFF so
    that the scan below stops at the end of the significant data. */
90 lsl limit, limit, #3 /* Bytes -> bits. */
91 mov mask, #~0
92#ifdef __AARCH64EB__
93 lsr mask, mask, limit
94#else
95 lsl mask, mask, limit
96#endif
97 bic data1, data1, mask
98 bic data2, data2, mask
99
100 orr diff, diff, mask
101.Lnot_limit:
102
 /* On little-endian, byte-reverse so that the earliest byte in memory
    becomes the most significant one. */
103#ifndef __AARCH64EB__
104 rev diff, diff
105 rev data1, data1
106 rev data2, data2
107#endif
108 /* The MS-non-zero bit of DIFF marks either the first bit
109 that is different, or the end of the significant data.
110 Shifting left now will bring the critical information into the
111 top bits. */
112 clz pos, diff
113 lsl data1, data1, pos
114 lsl data2, data2, pos
115 /* But we need to zero-extend (char is unsigned) the value and then
116 perform a signed 32-bit subtraction. */
117 lsr data1, data1, #56
118 sub result, data1, data2, lsr #56
119 ret
120
121.Lmutual_align:
122 /* Sources are mutually aligned, but are not currently at an
123 alignment boundary. Round down the addresses and then mask off
124 the bytes that precede the start point. */
125 bic src1, src1, #7
126 bic src2, src2, #7
127 add limit, limit, tmp1 /* Adjust the limit for the extra. */
128 lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
129 ldr data1, [src1], #8
130 neg tmp1, tmp1 /* Bits to alignment -64. */
131 ldr data2, [src2], #8
132 mov tmp2, #~0
133#ifdef __AARCH64EB__
134 /* Big-endian. Early bytes are at MSB. */
135 lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
136#else
137 /* Little-endian. Early bytes are at LSB. */
138 lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
139#endif
140 add limit_wd, limit, #7
 /* Force the bytes before the start point to the same value (0xff) in
    both words so they can never compare as different. */
141 orr data1, data1, tmp2
142 orr data2, data2, tmp2
143 lsr limit_wd, limit_wd, #3
144 b .Lstart_realigned
145
146.Lret0:
147 mov result, #0
148 ret
149
150 .p2align 6
151.Lmisaligned8:
 /* Pre-decrement so that the SUBS in the loop clears the carry flag
    (borrow) on the iteration that loads the last byte. */
152 sub limit, limit, #1
1531:
154 /* Perhaps we can do better than this. */
155 ldrb data1w, [src1], #1
156 ldrb data2w, [src2], #1
157 subs limit, limit, #1
 /* If bytes remain (carry set), compare the two bytes; otherwise force
    NZCV to 0 ("not equal") so the loop terminates. */
158 ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
159 b.eq 1b
160 sub result, data1, data2
161 ret
162 .size memcmp, . - memcmp
0163
=== added file 'src/aarch64/strnlen.S'
--- src/aarch64/strnlen.S 1970-01-01 00:00:00 +0000
+++ src/aarch64/strnlen.S 2013-01-07 16:09:31 +0000
@@ -0,0 +1,164 @@
1/* strnlen - calculate the length of a string with limit.
2
3 Copyright (c) 2013, Linaro Limited
4 All rights reserved.
5
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 * Redistributions of source code must retain the above copyright
9 notice, this list of conditions and the following disclaimer.
10 * Redistributions in binary form must reproduce the above copyright
11 notice, this list of conditions and the following disclaimer in the
12 documentation and/or other materials provided with the distribution.
13 * Neither the name of the Linaro nor the
14 names of its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16
17 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
28
29/* Assumptions:
30 *
31 * ARMv8-a, AArch64
32 */
33
34/* Arguments and results. */
/* srcin and len share x0: the input pointer is overwritten by the result. */
35#define srcin x0
36#define len x0
37#define limit x1
38
39/* Locals and temporaries. */
40#define src x2
41#define data1 x3
42#define data2 x4
43#define data2a x5
44#define has_nul1 x6
45#define has_nul2 x7
46#define tmp1 x8
47#define tmp2 x9
48#define tmp3 x10
49#define tmp4 x11
50#define zeroones x12
51#define pos x13
52#define limit_wd x14
53
/* def_fn NAME [p2align=N]: switch to .text, align to 2^N bytes, declare
   NAME as a global function symbol and emit its label. */
54 .macro def_fn f p2align=0
55 .text
56 .p2align \p2align
57 .global \f
58 .type \f, %function
59\f:
60 .endm
61
/* Byte-replicated constants for the NUL-detection trick used below.
   Note: REP8_80 is defined but not referenced in this file. */
62#define REP8_01 0x0101010101010101
63#define REP8_7f 0x7f7f7f7f7f7f7f7f
64#define REP8_80 0x8080808080808080
65
66 .text
67 .p2align 6
68.Lstart:
69 /* Pre-pad to ensure critical loop begins an icache line. */
70 .rep 7
71 nop
72 .endr
73 /* Put this code here to avoid wasting more space with pre-padding. */
 /* Shared exit: no NUL was found within the limit, so return LIMIT. */
74.Lhit_limit:
75 mov len, limit
76 ret
77
/* strnlen (srcin = x0, limit = x1) -> len in x0.
   Returns the index of the first NUL byte in the string at SRCIN, or
   LIMIT if that is smaller (including LIMIT == 0).
   The string is read in 16-byte chunks starting from SRCIN rounded
   down to a 16-byte boundary; bytes before SRCIN are masked out in
   .Lmisaligned before joining the main loop. */
78def_fn strnlen
79 cbz limit, .Lhit_limit
80 mov zeroones, #REP8_01
81 bic src, srcin, #15
82 ands tmp1, srcin, #15
83 b.ne .Lmisaligned
 /* limit_wd = number of 16-byte chunks needed to cover LIMIT bytes,
    rounded up. */
84 add limit_wd, limit, #15
85 lsr limit_wd, limit_wd, #4
86 /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
87 (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
88 can be done in parallel across the entire word. */
89 /* The inner loop deals with two Dwords at a time. This has a
90 slightly higher start-up cost, but we should win quite quickly,
91 especially on cores with a high number of issue slots per
92 cycle, as we get much better parallelism out of the operations. */
93
94 /* Start of critical section -- keep to one 64Byte cache line. */
95.Lloop:
96 ldp data1, data2, [src], #16
97.Lrealigned:
98 sub tmp1, data1, zeroones
99 orr tmp2, data1, #REP8_7f
100 sub tmp3, data2, zeroones
101 orr tmp4, data2, #REP8_7f
102 bic has_nul1, tmp1, tmp2
103 bic has_nul2, tmp3, tmp4
104 subs limit_wd, limit_wd, #1
105 orr tmp1, has_nul1, has_nul2
 /* While chunks remain (ne), test the combined syndrome against zero;
    once the count is exhausted, force NZCV to 0 ("not equal") so the
    loop exits. */
106 ccmp tmp1, #0, #0, ne /* NZCV = 0000 */
107 b.eq .Lloop
108 /* End of critical section -- keep to one 64Byte cache line. */
109
110 orr tmp1, has_nul1, has_nul2
111 cbz tmp1, .Lhit_limit /* No null in final Qword. */
112
113 /* We know there's a null in the final Qword. The easiest thing
114 to do now is work out the length of the string and return
115 MIN (len, limit). */
116
 /* SRC has already been advanced past the final chunk, so start from
    (src - srcin) and step back: 8 bytes below in all cases, plus 8
    more here when the NUL is in the first Dword. */
117 sub len, src, srcin
118 cbz has_nul1, .Lnul_in_data2
119#ifdef __AARCH64EB__
 /* The big-endian code below recomputes the syndrome from data2, so
    move the Dword holding the NUL there. */
120 mov data2, data1
121#endif
122 sub len, len, #8
123 mov has_nul2, has_nul1
124.Lnul_in_data2:
125#ifdef __AARCH64EB__
126 /* For big-endian, carry propagation (if the final byte in the
127 string is 0x01) means we cannot use has_nul directly. The
128 easiest way to get the correct byte is to byte-swap the data
129 and calculate the syndrome a second time. */
130 rev data2, data2
131 sub tmp1, data2, zeroones
132 orr tmp2, data2, #REP8_7f
133 bic has_nul2, tmp1, tmp2
134#endif
135 sub len, len, #8
 /* Byte-reverse the syndrome so that CLZ counts from the earliest byte
    in memory; pos / 8 is then the index of the first NUL byte. */
136 rev has_nul2, has_nul2
137 clz pos, has_nul2
138 add len, len, pos, lsr #3 /* Bits to bytes. */
139 cmp len, limit
140 csel len, len, limit, ls /* Return the lower value. */
141 ret
142
143.Lmisaligned:
 /* tmp1 holds (srcin & 15), the number of bytes between the rounded-down
    SRC and the real start. Extend the limit by that amount. */
144 add tmp3, limit, tmp1
 /* The flags set by this CMP are consumed by the CSINV/CSEL at the end
    of this sequence; none of the instructions in between set flags. */
145 cmp tmp1, #8
146 neg tmp1, tmp1
147 ldp data1, data2, [src], #16
148 add limit_wd, tmp3, #15
149 lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
150 mov tmp2, #~0
151 lsr limit_wd, limit_wd, #4
152#ifdef __AARCH64EB__
153 /* Big-endian. Early bytes are at MSB. */
154 lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
155#else
156 /* Little-endian. Early bytes are at LSB. */
157 lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
158#endif
 /* Offset <= 8 (le): the mask applies to data1 only, data2 is kept.
    Offset > 8: all of data1 precedes the start, so it becomes all-ones,
    and the mask applies to data2 instead. */
159 orr data1, data1, tmp2
160 orr data2a, data2, tmp2
161 csinv data1, data1, xzr, le
162 csel data2, data2, data2a, le
163 b .Lrealigned
164 .size strnlen, . - .Lstart /* Include pre-padding in size. */
0165
=== added file 'tests/test-strnlen.c'
--- tests/test-strnlen.c 1970-01-01 00:00:00 +0000
+++ tests/test-strnlen.c 2013-01-07 16:09:31 +0000
@@ -0,0 +1,199 @@
1/* Test and measure strnlen functions.
2 Copyright (C) 1999-2012 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Written by Jakub Jelinek <jakub@redhat.com>, 1999.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
19
20#define TEST_MAIN
21#define TEST_NAME "strnlen"
22#include "test-string.h"
23
/* Minimum of two values.  Each argument may be evaluated twice, so do
   not pass expressions with side effects.  */
24#define MIN(a,b) ((a) < (b) ? (a) : (b))
25
/* Signature shared by every strnlen implementation under test.  */
26typedef size_t (*proto_t) (const char *, size_t);
27size_t simple_strnlen (const char *, size_t);
28
/* NOTE(review): IMPL is defined outside this file (presumably in
   test-string.h); it appears to register an implementation for
   FOR_EACH_IMPL, with the second argument acting as a flag -- confirm
   against the header.  */
29IMPL (simple_strnlen, 0)
30IMPL (strnlen, 1)
31
/* Reference implementation: return the number of bytes in S before the
   first NUL, examining at most MAXLEN bytes.  Returns MAXLEN when no NUL
   is found within that range.  */
32size_t
33simple_strnlen (const char *s, size_t maxlen)
34{
35 size_t i;
36
 /* Empty loop body: all the work is done in the loop condition.  */
37 for (i = 0; i < maxlen && s[i]; ++i);
38 return i;
39}
40
/* Call IMPL once on (S, MAXLEN) and compare the result with EXP_LEN.
   On a mismatch, report it through error (), set the global `ret' to 1
   and return without timing.  Otherwise, when HP_TIMING_AVAIL is true,
   time 32 further calls and print the value accumulated in best_time
   by HP_TIMING_BEST.  */
41static void
42do_one_test (impl_t *impl, const char *s, size_t maxlen, size_t exp_len)
43{
44 size_t len = CALL (impl, s, maxlen);
45 if (len != exp_len)
46 {
47 error (0, 0, "Wrong result in function %s %zd %zd", impl->name,
48 len, exp_len);
49 ret = 1;
50 return;
51 }
52
53 if (HP_TIMING_AVAIL)
54 {
55 hp_timing_t start __attribute ((unused));
56 hp_timing_t stop __attribute ((unused));
 /* Start from the largest representable value.  */
57 hp_timing_t best_time = ~ (hp_timing_t) 0;
58 size_t i;
59
60 for (i = 0; i < 32; ++i)
61 {
62 HP_TIMING_NOW (start);
63 CALL (impl, s, maxlen);
64 HP_TIMING_NOW (stop);
65 HP_TIMING_BEST (best_time, start, stop);
66 }
67
68 printf ("\t%zd", (size_t) best_time);
69 }
70}
71
/* Build a string of LEN bytes at buf1 + (ALIGN & 7), followed by a NUL,
   and run every registered implementation on it with limit MAXLEN.
   The expected result is MIN (LEN, MAXLEN).  MAX_CHAR is the modulus
   used when generating the fill pattern.  The test is silently skipped
   when the string would not fit below page_size.  */
72static void
73do_test (size_t align, size_t len, size_t maxlen, int max_char)
74{
75 size_t i;
76
77 align &= 7;
78 if (align + len >= page_size)
79 return;
80
 /* The "1 +" keeps the generated value at least 1, so the only NUL is
    the terminator stored below (given the MAX_CHAR values used by
    test_main, 127 and 255).  */
81 for (i = 0; i < len; ++i)
82 buf1[align + i] = 1 + 7 * i % max_char;
83 buf1[align + len] = 0;
84
85 if (HP_TIMING_AVAIL)
86 printf ("Length %4zd, alignment %2zd:", len, align);
87
88 FOR_EACH_IMPL (impl, 0)
89 do_one_test (impl, (char *) (buf1 + align), maxlen, MIN (len, maxlen));
90
91 if (HP_TIMING_AVAIL)
92 putchar ('\n');
93}
94
/* Randomised test.  For ITERATIONS rounds, build a string of random
   length (0..511) at a random offset (0..15) inside the 512 bytes that
   end at buf1 + page_size, then check each implementation with three
   limits: len - 1 (only when len > 0), len, and len + 1.  Any mismatch
   is reported through error () and sets the global `ret' to 1.  */
95static void
96do_random_tests (void)
97{
98 size_t i, j, n, align, len;
99 unsigned char *p = buf1 + page_size - 512;
100
101 for (n = 0; n < ITERATIONS; n++)
102 {
103 align = random () & 15;
104 len = random () & 511;
 /* Keep the terminator (at index len + align) inside the 512-byte
    window.  */
105 if (len + align > 510)
106 len = 511 - align - (random () & 7);
 /* Fill up to 64 bytes past the terminator, capped at the window end.  */
107 j = len + align + 64;
108 if (j > 512)
109 j = 512;
110
111 for (i = 0; i < j; i++)
112 {
113 if (i == len + align)
114 p[i] = 0;
115 else
116 {
117 p[i] = random () & 255;
 /* Bytes inside the string must be non-zero; bytes before and
    after it may be anything.  */
118 if (i >= align && i < len + align && !p[i])
119 p[i] = (random () & 127) + 1;
120 }
121 }
122
123 FOR_EACH_IMPL (impl, 1)
124 {
 /* Limit shorter than the string: the limit must be returned.  */
125 if (len > 0
126 && CALL (impl, (char *) (p + align), len - 1) != len - 1)
127 {
128 error (0, 0, "Iteration %zd (limited) - wrong result in function %s (%zd) %zd != %zd, p %p",
129 n, impl->name, align,
130 CALL (impl, (char *) (p + align), len - 1), len - 1, p);
131 ret = 1;
132 }
 /* Limit equal to the string length.  */
133 if (CALL (impl, (char *) (p + align), len) != len)
134 {
135 error (0, 0, "Iteration %zd (exact) - wrong result in function %s (%zd) %zd != %zd, p %p",
136 n, impl->name, align,
137 CALL (impl, (char *) (p + align), len), len, p);
138 ret = 1;
139 }
 /* Limit one past the string length: the NUL must be found.  */
140 if (CALL (impl, (char *) (p + align), len + 1) != len)
141 {
142 error (0, 0, "Iteration %zd (long) - wrong result in function %s (%zd) %zd != %zd, p %p",
143 n, impl->name, align,
144 CALL (impl, (char *) (p + align), len + 1), len, p);
145 ret = 1;
146 }
147 }
148 }
149}
150
/* Test entry point: print a header row of implementation names, run the
   fixed do_test cases, then the randomised tests.  Returns the global
   `ret', which the test helpers set to 1 on any failure.  */
151int
152test_main (void)
153{
154 size_t i;
155
156 test_init ();
157
158 printf ("%20s", "");
159 FOR_EACH_IMPL (impl, 0)
160 printf ("\t%s", impl->name);
161 putchar ('\n');
162
 /* Short strings with the limit just below, at, and just above the
    length; first at alignment 0, then at alignment == length.  */
163 for (i = 1; i < 8; ++i)
164 {
165 do_test (0, i, i - 1, 127);
166 do_test (0, i, i, 127);
167 do_test (0, i, i + 1, 127);
168 }
169
170 for (i = 1; i < 8; ++i)
171 {
172 do_test (i, i, i - 1, 127);
173 do_test (i, i, i, 127);
174 do_test (i, i, i + 1, 127);
175 }
176
 /* Power-of-two lengths from 4 to 1024 with a limit (5000) larger than
    any of them, at alignments 0 and 1.  */
177 for (i = 2; i <= 10; ++i)
178 {
179 do_test (0, 1 << i, 5000, 127);
180 do_test (1, 1 << i, 5000, 127);
181 }
182
 /* Same shapes again with max_char 255 and the large limit.  */
183 for (i = 1; i < 8; ++i)
184 do_test (0, i, 5000, 255);
185
186 for (i = 1; i < 8; ++i)
187 do_test (i, i, 5000, 255);
188
189 for (i = 2; i <= 10; ++i)
190 {
191 do_test (0, 1 << i, 5000, 255);
192 do_test (1, 1 << i, 5000, 255);
193 }
194
195 do_random_tests ();
196 return ret;
197}
198
199#include "test-skeleton.c"

Subscribers

People subscribed via source and target branches