Bug 585538 - Use SIMD UTF8 to UTF16 code on Linux 32-bit. r=khuey, a2.0=bsmedberg

--HG--
extra : rebase_source : 21031b1e8366c00a9c4745e69f206d2358a294e2
This commit is contained in:
Justin Lebar 2010-08-11 16:49:43 -07:00
parent 34c407b464
commit 29ff79fc83
3 changed files with 162 additions and 94 deletions

View File

@ -75,7 +75,22 @@ CPPSRCS += \
nsUnicodeToCP1252.cpp \
nsUnicodeToMacRoman.cpp \
$(NULL)
endif
# Are we targeting x86-32 or x86-64? If so, we also want to build
# nsUTF8ToUnicodeSSE2.cpp, which holds the SSE2 version of the ASCII-run
# conversion used by nsUTF8ToUnicode.cpp.
ifneq (,$(INTEL_ARCHITECTURE))
CPPSRCS += nsUTF8ToUnicodeSSE2.cpp
# nsUTF8ToUnicodeSSE2.cpp uses SSE2 intrinsics, so we need to pass -msse2 if
# we're using gcc. (See bug 585538 comment 12.)
ifdef GNU_CC
nsUTF8ToUnicodeSSE2.$(OBJ_SUFFIX): CXXFLAGS+=-msse2
endif
ifdef SOLARIS_SUNPRO_CXX
nsUTF8ToUnicodeSSE2.$(OBJ_SUFFIX): OS_CXXFLAGS += -xarch=sse2 -xO4
endif
endif
ifeq ($(MOZ_WIDGET_TOOLKIT),os2)
@ -96,12 +111,6 @@ endif
endif
endif
ifeq (86,$(findstring 86,$(OS_TEST)))
ifdef SOLARIS_SUNPRO_CXX
nsUTF8ToUnicode.$(OBJ_SUFFIX): OS_CXXFLAGS += -xarch=sse2 -xO4
endif
endif
EXTRA_DSO_LDOPTS = \
../util/$(LIB_PREFIX)ucvutil_s.$(LIB_SUFFIX) \
$(MOZ_UNICHARUTIL_LIBS) \

View File

@ -35,9 +35,6 @@
*
* ***** END LICENSE BLOCK ***** */
// So SSE.h will include emmintrin.h in an appropriate way:
#define MOZILLA_SSE_INCLUDE_HEADER_FOR_SSE2
#include "nsUCSupport.h"
#include "nsUTF8ToUnicode.h"
#include "mozilla/SSE.h"
@ -116,90 +113,7 @@ NS_IMETHODIMP nsUTF8ToUnicode::Reset()
// number of bytes left in src and the number of unichars available in
// dst.)
#ifdef MOZILLA_COMPILE_WITH_SSE2
/**
 * Copy the leading run of ASCII bytes (high bit clear) from |src| to |dst|,
 * widening each byte to one PRUnichar.
 *
 * @param src  in/out: read cursor; on return it points at the first byte
 *             that was not consumed (either a byte with the high bit set or
 *             the byte just past the |len| bytes examined).
 * @param dst  in/out: write cursor; advanced by one PRUnichar per byte
 *             consumed.  No bounds check on |dst| is done here, so the
 *             caller must guarantee room for |len| PRUnichars.
 * @param len  maximum number of bytes to examine.
 *
 * When |len| is at least 16 and mozilla::use_sse2() is true, the bulk of the
 * run is converted 16 bytes at a time with SSE2; everything else goes through
 * the byte-at-a-time loop at the bottom.
 */
static inline void
Convert_ascii_run (const char *&src,
                   PRUnichar *&dst,
                   PRInt32 len)
{
  if (len > 15 && mozilla::use_sse2()) {
    __m128i in, out1, out2;
    __m128i zeroes;
    PRUint32 offset;

    // align input to 16 bytes so that _mm_load_si128 (aligned load) is legal
    while ((NS_PTR_TO_UINT32(src) & 15) && len > 0) {
      if (*src & 0x80U)
        return;
      *dst++ = (PRUnichar) *src++;
      len--;
    }

    zeroes = _mm_setzero_si128();

    // How dst is aligned decides which kind of store we may use.
    offset = NS_PTR_TO_UINT32(dst) & 15;

    // Note: all these inner loops have to break, not return; we need
    // to let the single-char loop below catch any leftover
    // byte-at-a-time ASCII chars, since this function must consume
    // all available ASCII chars before it returns

    if (offset == 0) {
      // dst is 16-byte aligned: two aligned non-temporal 128-bit stores.
      while (len > 15) {
        in = _mm_load_si128((__m128i *) src);
        // movemask gathers the top bit of every byte; non-zero means at
        // least one of the 16 bytes is not ASCII.
        if (_mm_movemask_epi8(in))
          break;
        // Interleaving with zero bytes zero-extends each byte to 16 bits.
        out1 = _mm_unpacklo_epi8(in, zeroes);
        out2 = _mm_unpackhi_epi8(in, zeroes);
        _mm_stream_si128((__m128i *) dst, out1);
        _mm_stream_si128((__m128i *) (dst + 8), out2);
        dst += 16;
        src += 16;
        len -= 16;
      }
    } else if (offset == 8) {
      // dst is 8-byte (but not 16-byte) aligned: write each 128-bit result
      // as two 64-bit halves.
      while (len > 15) {
        in = _mm_load_si128((__m128i *) src);
        if (_mm_movemask_epi8(in))
          break;
        out1 = _mm_unpacklo_epi8(in, zeroes);
        out2 = _mm_unpackhi_epi8(in, zeroes);
        _mm_storel_epi64((__m128i *) dst, out1);
        _mm_storel_epi64((__m128i *) (dst + 8), out2);
        // _mm_storeh_pd wants a __m128d; _mm_castsi128_pd reinterprets the
        // register in place instead of reading the __m128i object through a
        // pointer of a different type.
        _mm_storeh_pd((double *) (dst + 4), _mm_castsi128_pd(out1));
        _mm_storeh_pd((double *) (dst + 12), _mm_castsi128_pd(out2));
        src += 16;
        dst += 16;
        len -= 16;
      }
    } else {
      // Any other dst alignment: fall back to unaligned 128-bit stores.
      while (len > 15) {
        in = _mm_load_si128((__m128i *) src);
        if (_mm_movemask_epi8(in))
          break;
        out1 = _mm_unpacklo_epi8(in, zeroes);
        out2 = _mm_unpackhi_epi8(in, zeroes);
        _mm_storeu_si128((__m128i *) dst, out1);
        _mm_storeu_si128((__m128i *) (dst + 8), out2);
        src += 16;
        dst += 16;
        len -= 16;
      }
    }
  }

  // finish off a byte at a time
  while (len-- > 0 && (*src & 0x80U) == 0) {
    *dst++ = (PRUnichar) *src++;
  }
}
#elif defined(__arm__) || defined(_M_ARM)
#if defined(__arm__) || defined(_M_ARM)
// on ARM, do extra work to avoid byte/halfword reads/writes by
// reading/writing a word at a time for as long as we can
@ -256,13 +170,30 @@ finish:
}
}
#else /* generic code */
#else
#ifdef MOZILLA_MAY_SUPPORT_SSE2
// Declaration only: the SSE2 implementation of Convert_ascii_run is not
// defined in this file.  It takes the same (src, dst, len) arguments as the
// static Convert_ascii_run below, which forwards to it when
// mozilla::supports_sse2() returns true.
namespace mozilla {
namespace SSE2 {
void Convert_ascii_run(const char *&src, PRUnichar *&dst, PRInt32 len);
}
}
#endif
static inline void
Convert_ascii_run (const char *&src,
PRUnichar *&dst,
PRInt32 len)
{
#ifdef MOZILLA_MAY_SUPPORT_SSE2
if (mozilla::supports_sse2()) {
mozilla::SSE2::Convert_ascii_run(src, dst, len);
return;
}
#endif
while (len-- > 0 && (*src & 0x80U) == 0) {
*dst++ = (PRUnichar) *src++;
}

View File

@ -0,0 +1,128 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Foundation code.
*
* The Initial Developer of the Original Code is the Mozilla Foundation.
*
* Portions created by the Initial Developer are Copyright (C) 2010
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
// This file should only be compiled if you're on x86 or x86_64. Additionally,
// you'll need to compile this file with -msse2 if you're using gcc.
#include <emmintrin.h>
#include "nscore.h"
namespace mozilla {
namespace SSE2 {

/**
 * SSE2 implementation: copy the leading run of ASCII bytes (high bit clear)
 * from |src| to |dst|, widening each byte to one PRUnichar.
 *
 * @param src  in/out: read cursor; on return it points at the first byte
 *             that was not consumed (either a byte with the high bit set or
 *             the byte just past the |len| bytes examined).
 * @param dst  in/out: write cursor; advanced by one PRUnichar per byte
 *             consumed.  No bounds check on |dst| is done here, so the
 *             caller must guarantee room for |len| PRUnichars.
 * @param len  maximum number of bytes to examine.
 *
 * This function uses SSE2 instructions unconditionally whenever |len| is at
 * least 16; it does not itself check that the CPU supports SSE2.
 */
void
Convert_ascii_run(const char *&src,
                  PRUnichar *&dst,
                  PRInt32 len)
{
  if (len > 15) {
    __m128i in, out1, out2;
    __m128i zeroes;
    PRUint32 offset;

    // align input to 16 bytes so that _mm_load_si128 (aligned load) is legal
    while ((NS_PTR_TO_UINT32(src) & 15) && len > 0) {
      if (*src & 0x80U)
        return;
      *dst++ = (PRUnichar) *src++;
      len--;
    }

    zeroes = _mm_setzero_si128();

    // How dst is aligned decides which kind of store we may use.
    offset = NS_PTR_TO_UINT32(dst) & 15;

    // Note: all these inner loops have to break, not return; we need
    // to let the single-char loop below catch any leftover
    // byte-at-a-time ASCII chars, since this function must consume
    // all available ASCII chars before it returns

    if (offset == 0) {
      // dst is 16-byte aligned: two aligned non-temporal 128-bit stores.
      while (len > 15) {
        in = _mm_load_si128((__m128i *) src);
        // movemask gathers the top bit of every byte; non-zero means at
        // least one of the 16 bytes is not ASCII.
        if (_mm_movemask_epi8(in))
          break;
        // Interleaving with zero bytes zero-extends each byte to 16 bits.
        out1 = _mm_unpacklo_epi8(in, zeroes);
        out2 = _mm_unpackhi_epi8(in, zeroes);
        _mm_stream_si128((__m128i *) dst, out1);
        _mm_stream_si128((__m128i *) (dst + 8), out2);
        dst += 16;
        src += 16;
        len -= 16;
      }
    } else if (offset == 8) {
      // dst is 8-byte (but not 16-byte) aligned: write each 128-bit result
      // as two 64-bit halves.
      while (len > 15) {
        in = _mm_load_si128((__m128i *) src);
        if (_mm_movemask_epi8(in))
          break;
        out1 = _mm_unpacklo_epi8(in, zeroes);
        out2 = _mm_unpackhi_epi8(in, zeroes);
        _mm_storel_epi64((__m128i *) dst, out1);
        _mm_storel_epi64((__m128i *) (dst + 8), out2);
        // _mm_storeh_pd wants a __m128d; _mm_castsi128_pd reinterprets the
        // register in place instead of reading the __m128i object through a
        // pointer of a different type.
        _mm_storeh_pd((double *) (dst + 4), _mm_castsi128_pd(out1));
        _mm_storeh_pd((double *) (dst + 12), _mm_castsi128_pd(out2));
        src += 16;
        dst += 16;
        len -= 16;
      }
    } else {
      // Any other dst alignment: fall back to unaligned 128-bit stores.
      while (len > 15) {
        in = _mm_load_si128((__m128i *) src);
        if (_mm_movemask_epi8(in))
          break;
        out1 = _mm_unpacklo_epi8(in, zeroes);
        out2 = _mm_unpackhi_epi8(in, zeroes);
        _mm_storeu_si128((__m128i *) dst, out1);
        _mm_storeu_si128((__m128i *) (dst + 8), out2);
        src += 16;
        dst += 16;
        len -= 16;
      }
    }
  }

  // finish off a byte at a time
  while (len-- > 0 && (*src & 0x80U) == 0) {
    *dst++ = (PRUnichar) *src++;
  }
}

} // namespace SSE2
} // namespace mozilla