Cortex String Routines

Overview
Code
Bugs
Blueprints
Translations
Answers

Merge lp:~matthew-gretton-dann/cortex-strings/aarch64-additions-2 into lp:cortex-strings

aarch64-additions-2
Merge into trunk

Proposed by Matthew Gretton-Dann on 2013-01-07

Status:	Merged
Merged at revision:	97
Proposed branch:	lp:~matthew-gretton-dann/cortex-strings/aarch64-additions-2
Merge into:	lp:cortex-strings
Diff against target:	569 lines (+530/-2) 4 files modified Makefile.am (+5/-2) src/aarch64/memcmp.S (+162/-0) src/aarch64/strnlen.S (+164/-0) tests/test-strnlen.c (+199/-0)
To merge this branch:	bzr merge lp:~matthew-gretton-dann/cortex-strings/aarch64-additions-2
Related bugs:	Link a bug report

Reviewer	Review Type	Date Requested	Status
Linaro Toolchain Developers		2013-01-07	Pending
Review via email: mp+142147@code.launchpad.net

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk

Download diff
Side-by-side diff

Subscribers

People subscribed via source and target branches

to all changes:

Linaro Toolchain Developers

Matthew Gretton-Dann

Cortex String Routines

Merge lp:~matthew-gretton-dann/cortex-strings/aarch64-additions-2 into lp:cortex-strings

Commit message

Description of the change

Preview Diff

Subscribers

 === modified file 'Makefile.am'
 --- Makefile.am	2013-01-07 14:12:23 +0000
 +++ Makefile.am	2013-01-07 16:09:31 +0000
@@ -44,7 +44,8 @@
  	tests/test-strcmp \
  	tests/test-strcpy \
  	tests/test-strlen \
--	tests/test-strncmp
++	tests/test-strncmp \
++	tests/test-strnlen
  # Options for the tests
  tests_cflags = -I$(srcdir)/tests $(AM_CFLAGS)
@@ -266,12 +267,14 @@
  if HOST_AARCH64
  libcortex_strings_la_SOURCES = \
++	src/aarch64/memcmp.S \
  	src/aarch64/memcpy.S \
  	src/aarch64/memmove.S \
  	src/aarch64/memset.S \
  	src/aarch64/strcmp.S \
  	src/aarch64/strlen.S \
--	src/aarch64/strncmp.S
++	src/aarch64/strncmp.S \
++	src/aarch64/strnlen.S
  endif
 === added file 'src/aarch64/memcmp.S'
 --- src/aarch64/memcmp.S	1970-01-01 00:00:00 +0000
 +++ src/aarch64/memcmp.S	2013-01-07 16:09:31 +0000
@@ -0,0 +1,162 @@
++/* memcmp - compare memory
++
++   Copyright (c) 2013, Linaro Limited
++   All rights reserved.
++
++   Redistribution and use in source and binary forms, with or without
++   modification, are permitted provided that the following conditions are met:
++       * Redistributions of source code must retain the above copyright
++         notice, this list of conditions and the following disclaimer.
++       * Redistributions in binary form must reproduce the above copyright
++         notice, this list of conditions and the following disclaimer in the
++         documentation and/or other materials provided with the distribution.
++       * Neither the name of the Linaro nor the
++         names of its contributors may be used to endorse or promote products
++         derived from this software without specific prior written permission.
++
++   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
++   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
++   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
++   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
++   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
++   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
++   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
++   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
++   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
++   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
++
++/* Assumptions:
++ *
++ * ARMv8-a, AArch64
++ */
++
++	.macro def_fn f p2align=0
++	.text			/* def_fn F [p2align=N]: open global function F in .text.  */
++	.p2align \p2align	/* Align the entry to 2^N bytes (N defaults to 0).  */
++	.global \f
++	.type \f, %function	/* Give the symbol function type.  */
++\f:
++	.endm
++
++/* Parameters and result.  */
++#define src1		x0	/* First buffer; advanced by the post-indexed loads.  */
++#define src2		x1	/* Second buffer; advanced by the post-indexed loads.  */
++#define limit		x2	/* Number of bytes to compare.  */
++#define result		x0	/* Same register as src1.  */
++
++/* Internal variables.  */
++#define data1		x3	/* Data loaded from src1.  */
++#define data1w		w3	/* 32-bit view of data1, used by the byte loop.  */
++#define data2		x4	/* Data loaded from src2.  */
++#define data2w		w4	/* 32-bit view of data2, used by the byte loop.  */
++#define has_nul		x5	/* Not referenced by the memcmp code below.  */
++#define diff		x6	/* data1 EOR data2.  */
++#define endloop		x7	/* Non-zero when the aligned loop must stop.  */
++#define tmp1		x8
++#define tmp2		x9
++#define tmp3		x10	/* Not referenced by the memcmp code below.  */
++#define pos		x11	/* Leading-zero count of diff.  */
++#define limit_wd	x12	/* Remaining Dword count.  */
++#define mask		x13	/* Mask covering the bytes beyond the limit.  */
++
++def_fn memcmp p2align=6	/* In: x0 = src1, x1 = src2, x2 = limit.  Out: x0 = result.  */
++	cbz	limit, .Lret0		/* Zero-length compare returns 0.  */
++	eor	tmp1, src1, src2
++	tst	tmp1, #7		/* Same offset within a Dword?  */
++	b.ne	.Lmisaligned8		/* No: use the byte-at-a-time loop.  */
++	ands	tmp1, src1, #7		/* tmp1 = offset of src1 within its Dword.  */
++	b.ne	.Lmutual_align
++	add	limit_wd, limit, #7
++	lsr	limit_wd, limit_wd, #3	/* Dwords to examine, rounded up.  */
++	/* Start of performance-critical section  -- one 64B cache line.  */
++.Lloop_aligned:
++	ldr	data1, [src1], #8
++	ldr	data2, [src2], #8
++.Lstart_realigned:
++	subs	limit_wd, limit_wd, #1	/* Z set on the last Dword.  */
++	eor	diff, data1, data2	/* Non-zero if differences found.  */
++	csinv	endloop, diff, xzr, ne	/* Last Dword or differences.  */
++	cbz	endloop, .Lloop_aligned
++	/* End of performance-critical section  -- one 64B cache line.  */
++
++	/* Not reached the limit, must have found a diff.  */
++	cbnz	limit_wd, .Lnot_limit
++
++	/* Limit % 8 == 0 => all bytes significant.  */
++	ands	limit, limit, #7
++	b.eq	.Lnot_limit
++
++	lsl	limit, limit, #3	/* Bytes -> bits.  */
++	mov	mask, #~0
++#ifdef __AARCH64EB__
++	lsr	mask, mask, limit	/* Ones over the bytes beyond the limit.  */
++#else
++	lsl	mask, mask, limit	/* Ones over the bytes beyond the limit.  */
++#endif
++	bic	data1, data1, mask	/* Clear the bytes beyond the limit.  */
++	bic	data2, data2, mask
++
++	orr	diff, diff, mask	/* Mark the end of the significant data in DIFF.  */
++.Lnot_limit:
++
++#ifndef	__AARCH64EB__
++	rev	diff, diff		/* Put the first byte in memory at the MSB end.  */
++	rev	data1, data1
++	rev	data2, data2
++#endif
++	/* The MS-non-zero bit of DIFF marks either the first bit
++	   that is different, or the end of the significant data.
++	   Shifting left now will bring the critical information into the
++	   top bits.  */
++	clz	pos, diff
++	lsl	data1, data1, pos
++	lsl	data2, data2, pos
++	/* But we need to zero-extend (char is unsigned) the value and then
++	   perform a signed 32-bit subtraction.  */
++	lsr	data1, data1, #56
++	sub	result, data1, data2, lsr #56	/* NOTE(review): this SUB operates on the 64-bit x registers.  */
++	ret
++
++.Lmutual_align:
++	/* Sources are mutually aligned, but are not currently at an
++	   alignment boundary.  Round down the addresses and then mask off
++	   the bytes that precede the start point.  */
++	bic	src1, src1, #7
++	bic	src2, src2, #7
++	add	limit, limit, tmp1	/* Adjust the limit for the extra.  */
++	lsl	tmp1, tmp1, #3		/* Bytes beyond alignment -> bits.  */
++	ldr	data1, [src1], #8
++	neg	tmp1, tmp1		/* Bits to alignment -64.  */
++	ldr	data2, [src2], #8
++	mov	tmp2, #~0
++#ifdef __AARCH64EB__
++	/* Big-endian.  Early bytes are at MSB.  */
++	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
++#else
++	/* Little-endian.  Early bytes are at LSB.  */
++	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
++#endif
++	add	limit_wd, limit, #7
++	orr	data1, data1, tmp2	/* Same mask ORed into both words, so the masked bits compare equal.  */
++	orr	data2, data2, tmp2
++	lsr	limit_wd, limit_wd, #3	/* Dwords to examine, from the adjusted limit.  */
++	b	.Lstart_realigned
++
++.Lret0:
++	mov	result, #0
++	ret
++
++	.p2align 6
++.Lmisaligned8:
++	sub	limit, limit, #1	/* Bias the count so the SUBS below borrows on the final byte.  */
++1:
++	/* Perhaps we can do better than this.  */
++	ldrb	data1w, [src1], #1
++	ldrb	data2w, [src2], #1
++	subs	limit, limit, #1	/* Carry clear once the last byte has been loaded.  */
++	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
++	b.eq	1b			/* Loop while the bytes match and more remain.  */
++	sub	result, data1, data2	/* Difference of the last pair of bytes loaded.  */
++	ret
++	.size memcmp, . - memcmp
 === added file 'src/aarch64/strnlen.S'
 --- src/aarch64/strnlen.S	1970-01-01 00:00:00 +0000
 +++ src/aarch64/strnlen.S	2013-01-07 16:09:31 +0000
@@ -0,0 +1,164 @@
++/* strnlen - calculate the length of a string with limit.
++
++   Copyright (c) 2013, Linaro Limited
++   All rights reserved.
++
++   Redistribution and use in source and binary forms, with or without
++   modification, are permitted provided that the following conditions are met:
++       * Redistributions of source code must retain the above copyright
++         notice, this list of conditions and the following disclaimer.
++       * Redistributions in binary form must reproduce the above copyright
++         notice, this list of conditions and the following disclaimer in the
++         documentation and/or other materials provided with the distribution.
++       * Neither the name of the Linaro nor the
++         names of its contributors may be used to endorse or promote products
++         derived from this software without specific prior written permission.
++
++   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
++   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
++   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
++   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
++   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
++   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
++   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
++   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
++   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
++   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
++
++/* Assumptions:
++ *
++ * ARMv8-a, AArch64
++ */
++
++/* Arguments and results.  */
++#define srcin		x0	/* String pointer as passed in.  */
++#define len		x0	/* Return value; same register as srcin.  */
++#define limit		x1	/* Maximum number of bytes to examine.  */
++
++/* Locals and temporaries.  */
++#define src		x2	/* Working pointer: srcin rounded down to 16, then stepped by 16.  */
++#define data1		x3	/* First Dword of the current Qword.  */
++#define data2		x4	/* Second Dword of the current Qword.  */
++#define data2a		x5	/* Masked copy of data2; used on the misaligned entry only.  */
++#define has_nul1	x6	/* NUL syndrome for data1.  */
++#define has_nul2	x7	/* NUL syndrome for data2.  */
++#define tmp1		x8
++#define tmp2		x9
++#define tmp3		x10
++#define tmp4		x11
++#define zeroones	x12	/* Holds REP8_01.  */
++#define pos		x13	/* Leading-zero count of the syndrome.  */
++#define limit_wd	x14	/* Remaining Qword count.  */
++
++	.macro def_fn f p2align=0
++	.text			/* def_fn F [p2align=N]: open global function F in .text.  */
++	.p2align \p2align	/* Align the entry to 2^N bytes (N defaults to 0).  */
++	.global \f
++	.type \f, %function	/* Give the symbol function type.  */
++\f:
++	.endm
++
++#define REP8_01 0x0101010101010101	/* 0x01 in every byte.  */
++#define REP8_7f 0x7f7f7f7f7f7f7f7f	/* 0x7f in every byte.  */
++#define REP8_80 0x8080808080808080	/* 0x80 in every byte; not referenced by the code below.  */
++
++	.text
++	.p2align	6
++.Lstart:
++	/* Pre-pad to ensure critical loop begins an icache line.  */
++	.rep 7	/* NOTE(review): 7 NOPs + 2 + the 7 instructions before .Lloop -- presumably 64 bytes in all; confirm before changing any count.  */
++	nop
++	.endr
++	/* Put this code here to avoid wasting more space with pre-padding.  */
++.Lhit_limit:
++	mov	len, limit		/* No NUL within LIMIT bytes: return LIMIT.  */
++	ret
++
++def_fn strnlen	/* In: x0 = srcin, x1 = limit.  Out: x0 = len.  */
++	cbz	limit, .Lhit_limit	/* limit == 0: return 0 without loading.  */
++	mov	zeroones, #REP8_01
++	bic	src, srcin, #15		/* Round down to a 16-byte boundary.  */
++	ands	tmp1, srcin, #15	/* tmp1 = offset of srcin within its Qword.  */
++	b.ne	.Lmisaligned
++	add	limit_wd, limit, #15	/* NOTE(review): wraps when limit > 2^64 - 16; confirm very large limits are handled.  */
++	lsr	limit_wd, limit_wd, #4	/* Qwords to examine, rounded up.  */
++	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
++	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
++	   can be done in parallel across the entire word.  */
++	/* The inner loop deals with two Dwords at a time.  This has a
++	   slightly higher start-up cost, but we should win quite quickly,
++	   especially on cores with a high number of issue slots per
++	   cycle, as we get much better parallelism out of the operations.  */
++
++	/* Start of critical section -- keep to one 64Byte cache line.  */
++.Lloop:
++	ldp	data1, data2, [src], #16
++.Lrealigned:
++	sub	tmp1, data1, zeroones
++	orr	tmp2, data1, #REP8_7f
++	sub	tmp3, data2, zeroones
++	orr	tmp4, data2, #REP8_7f
++	bic	has_nul1, tmp1, tmp2
++	bic	has_nul2, tmp3, tmp4
++	subs	limit_wd, limit_wd, #1	/* Z set on the last Qword.  */
++	orr	tmp1, has_nul1, has_nul2
++	ccmp	tmp1, #0, #0, ne	/* NZCV = 0000  */
++	b.eq	.Lloop			/* Loop while Qwords remain and no NUL was seen.  */
++	/* End of critical section -- keep to one 64Byte cache line.  */
++
++	orr	tmp1, has_nul1, has_nul2
++	cbz	tmp1, .Lhit_limit	/* No null in final Qword.  */
++
++	/* We know there's a null in the final Qword.  The easiest thing
++	   to do now is work out the length of the string and return
++	   MIN (len, limit).  */
++
++	sub	len, src, srcin		/* len shares x0 with srcin; src is 16 past the start of the final Qword.  */
++	cbz	has_nul1, .Lnul_in_data2
++#ifdef __AARCH64EB__
++	mov	data2, data1		/* Recompute the syndrome below from the first Dword.  */
++#endif
++	sub	len, len, #8		/* The NUL is in the first Dword, 8 bytes earlier.  */
++	mov	has_nul2, has_nul1
++.Lnul_in_data2:
++#ifdef __AARCH64EB__
++	/* For big-endian, carry propagation (if the final byte in the
++	   string is 0x01) means we cannot use has_nul directly.  The
++	   easiest way to get the correct byte is to byte-swap the data
++	   and calculate the syndrome a second time.  */
++	rev	data2, data2
++	sub	tmp1, data2, zeroones
++	orr	tmp2, data2, #REP8_7f
++	bic	has_nul2, tmp1, tmp2
++#endif
++	sub	len, len, #8		/* len = offset of the Dword holding the NUL.  */
++	rev	has_nul2, has_nul2	/* Byte-reverse so CLZ finds the first flagged byte.  */
++	clz	pos, has_nul2
++	add	len, len, pos, lsr #3		/* Bits to bytes.  */
++	cmp	len, limit
++	csel	len, len, limit, ls		/* Return the lower value.  */
++	ret
++
++.Lmisaligned:
++	add	tmp3, limit, tmp1	/* Count from the aligned base.  NOTE(review): can wrap for limits near 2^64; confirm.  */
++	cmp	tmp1, #8		/* Flags are consumed by the CSINV/CSEL at the end of this path.  */
++	neg	tmp1, tmp1
++	ldp	data1, data2, [src], #16
++	add	limit_wd, tmp3, #15
++	lsl	tmp1, tmp1, #3		/* Bytes beyond alignment -> bits.  */
++	mov	tmp2, #~0
++	lsr	limit_wd, limit_wd, #4	/* Qwords to examine, rounded up.  */
++#ifdef __AARCH64EB__
++	/* Big-endian.  Early bytes are at MSB.  */
++	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
++#else
++	/* Little-endian.  Early bytes are at LSB.  */
++	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
++#endif
++	orr	data1, data1, tmp2	/* Force the masked bytes to 0xff so they cannot look like NUL.  */
++	orr	data2a, data2, tmp2
++	csinv	data1, data1, xzr, le	/* Offset > 8: the whole first Dword is ignored (all ones).  */
++	csel	data2, data2, data2a, le	/* Offset <= 8: data2 unmasked; otherwise the masked copy.  */
++	b	.Lrealigned
++	.size	strnlen, . - .Lstart	/* Include pre-padding in size.  */
 === added file 'tests/test-strnlen.c'
 --- tests/test-strnlen.c	1970-01-01 00:00:00 +0000
 +++ tests/test-strnlen.c	2013-01-07 16:09:31 +0000
@@ -0,0 +1,199 @@
++/* Test and measure strnlen functions.
++   Copyright (C) 1999-2012 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++   Written by Jakub Jelinek <jakub@redhat.com>, 1999.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <http://www.gnu.org/licenses/>.  */
++
++#define TEST_MAIN	/* Defined before test-string.h is included; presumably read there -- TODO confirm.  */
++#define TEST_NAME "strnlen"	/* Presumably the name the harness reports -- TODO confirm.  */
++#include "test-string.h"
++
++#define MIN(a,b) ((a) < (b) ? (a) : (b))	/* Expands one argument twice; pass side-effect-free expressions.  */
++
++typedef size_t (*proto_t) (const char *, size_t);	/* strnlen-style signature; not referenced directly in this file.  */
++size_t simple_strnlen (const char *, size_t);
++
++IMPL (simple_strnlen, 0)	/* Presumably registers the reference implementation -- confirm in test-string.h.  */
++IMPL (strnlen, 1)	/* Presumably registers the function under test -- confirm in test-string.h.  */
++
++size_t
++simple_strnlen (const char *s, size_t maxlen)	/* Byte-at-a-time reference: length of S, capped at MAXLEN.  */
++{
++  size_t i;
++
++  for (i = 0; i < maxlen && s[i]; ++i);	/* Empty body: stop at MAXLEN or at the first NUL; never reads s[maxlen].  */
++  return i;
++}
++
++static void
++do_one_test (impl_t *impl, const char *s, size_t maxlen, size_t exp_len)	/* Check IMPL on (S, MAXLEN) against EXP_LEN, then optionally time it.  */
++{
++  size_t len = CALL (impl, s, maxlen);
++  if (len != exp_len)
++    {
++      error (0, 0, "Wrong result in function %s %zu %zu", impl->name,	/* %zu: both values are size_t.  */
++	     len, exp_len);
++      ret = 1;	/* ret is not declared in this file; test_main returns it.  */
++      return;
++    }
++
++  if (HP_TIMING_AVAIL)
++    {
++      hp_timing_t start __attribute ((unused));
++      hp_timing_t stop __attribute ((unused));
++      hp_timing_t best_time = ~ (hp_timing_t) 0;	/* Start from the largest value.  */
++      size_t i;
++
++      for (i = 0; i < 32; ++i)	/* Time 32 calls; HP_TIMING_BEST updates best_time.  */
++	{
++	  HP_TIMING_NOW (start);
++	  CALL (impl, s, maxlen);
++	  HP_TIMING_NOW (stop);
++	  HP_TIMING_BEST (best_time, start, stop);
++	}
++
++      printf ("\t%zu", (size_t) best_time);	/* The argument is cast to size_t, so print it unsigned.  */
++    }
++}
++
++static void
++do_test (size_t align, size_t len, size_t maxlen, int max_char)	/* Build a LEN-byte string at buf1 + ALIGN and run every implementation on it.  */
++{
++  size_t i;
++
++  align &= 7;	/* Only offsets 0-7 are exercised by this helper.  */
++  if (align + len >= page_size)	/* The string and its NUL must fit below page_size; otherwise skip the case.  */
++    return;
++
++  for (i = 0; i < len; ++i)
++    buf1[align + i] = 1 + 7 * i % max_char;	/* Always >= 1, so no NUL appears before the terminator.  */
++  buf1[align + len] = 0;
++
++  if (HP_TIMING_AVAIL)
++    printf ("Length %4zu, alignment %2zu:", len, align);	/* %zu: both values are size_t.  */
++
++  FOR_EACH_IMPL (impl, 0)
++    do_one_test (impl, (char *) (buf1 + align), maxlen, MIN (len, maxlen));	/* Expected result is min (LEN, MAXLEN).  */
++
++  if (HP_TIMING_AVAIL)
++    putchar ('\n');
++}
++
++static void
++do_random_tests (void)	/* Randomised check of limits LEN - 1, LEN and LEN + 1 on strings in the last 512 bytes before buf1 + page_size.  */
++{
++  size_t i, j, n, align, len;
++  unsigned char *p = buf1 + page_size - 512;
++
++  for (n = 0; n < ITERATIONS; n++)
++    {
++      align = random () & 15;	/* Start offset 0-15.  */
++      len = random () & 511;
++      if (len + align > 510)	/* Keep the string and its NUL inside the 512-byte window.  */
++	len = 511 - align - (random () & 7);
++      j = len + align + 64;	/* Fill up to 64 bytes past the string, clamped to the window.  */
++      if (j > 512)
++	j = 512;
++
++      for (i = 0; i < j; i++)
++	{
++	  if (i == len + align)
++	    p[i] = 0;	/* The terminator.  */
++	  else
++	    {
++	      p[i] = random () & 255;
++	      if (i >= align && i < len + align && !p[i])
++		p[i] = (random () & 127) + 1;	/* No NUL inside the string body.  */
++	    }
++	}
++
++      FOR_EACH_IMPL (impl, 1)
++	{
++	  if (len > 0	/* len - 1 would wrap for an empty string.  */
++	      && CALL (impl, (char *) (p + align), len - 1) != len - 1)
++	    {
++	      error (0, 0, "Iteration %zu (limited) - wrong result in function %s (%zu) %zu != %zu, p %p",
++		     n, impl->name, align,
++		     CALL (impl, (char *) (p + align), len - 1), len - 1, p);
++	      ret = 1;
++	    }
++	  if (CALL (impl, (char *) (p + align), len) != len)	/* Limit equal to the string length.  */
++	    {
++	      error (0, 0, "Iteration %zu (exact) - wrong result in function %s (%zu) %zu != %zu, p %p",
++		     n, impl->name, align,
++		     CALL (impl, (char *) (p + align), len), len, p);
++	      ret = 1;
++	    }
++	  if (CALL (impl, (char *) (p + align), len + 1) != len)	/* Limit one past the NUL.  */
++	    {
++	      error (0, 0, "Iteration %zu (long) - wrong result in function %s (%zu) %zu != %zu, p %p",
++		     n, impl->name, align,
++		     CALL (impl, (char *) (p + align), len + 1), len, p);
++	      ret = 1;
++	    }
++	}
++    }
++}
++
++int
++test_main (void)	/* Run the fixed do_test cases, then the random tests; returns ret.  */
++{
++  size_t i;
++
++  test_init ();
++
++  printf ("%20s", "");	/* Header row: 20 blanks, then one column per implementation name.  */
++  FOR_EACH_IMPL (impl, 0)
++    printf ("\t%s", impl->name);
++  putchar ('\n');
++
++  for (i = 1; i < 8; ++i)	/* Offset 0, lengths 1-7, limit just below, at and above the length.  */
++    {
++      do_test (0, i, i - 1, 127);
++      do_test (0, i, i, 127);
++      do_test (0, i, i + 1, 127);
++    }
++
++  for (i = 1; i < 8; ++i)	/* Same limits with the offset equal to the length.  */
++    {
++      do_test (i, i, i - 1, 127);
++      do_test (i, i, i, 127);
++      do_test (i, i, i + 1, 127);
++    }
++
++  for (i = 2; i <= 10; ++i)	/* Lengths 4 to 1024 with a limit (5000) larger than any of them.  */
++    {
++      do_test (0, 1 << i, 5000, 127);
++      do_test (1, 1 << i, 5000, 127);
++    }
++
++  for (i = 1; i < 8; ++i)	/* The remaining cases pass max_char = 255 instead of 127.  */
++    do_test (0, i, 5000, 255);
++
++  for (i = 1; i < 8; ++i)
++    do_test (i, i, 5000, 255);
++
++  for (i = 2; i <= 10; ++i)
++    {
++      do_test (0, 1 << i, 5000, 255);
++      do_test (1, 1 << i, 5000, 255);
++    }
++
++  do_random_tests ();
++  return ret;
++}
++
++#include "test-skeleton.c"