Bug 586698 - Add SSE versions of LossyConvertEncoding; r=tterribe,jst

This commit is contained in:
Justin Lebar 2010-08-13 09:15:44 -07:00
parent 10ebb07512
commit b774f4accc
5 changed files with 192 additions and 37 deletions

View File

@ -227,9 +227,7 @@ nsTextFragment::SetTo(const PRUnichar* aBuffer, PRInt32 aLength)
}
// Copy data
// Use the same copying code we use elsewhere; it's likely to be
// carefully tuned.
LossyConvertEncoding<PRUnichar, char> converter(buff);
LossyConvertEncoding16to8 converter(buff);
copy_string(aBuffer, aBuffer+aLength, converter);
m1b = buff;
}
@ -260,9 +258,8 @@ nsTextFragment::CopyTo(PRUnichar *aDest, PRInt32 aOffset, PRInt32 aCount)
} else {
const char *cp = m1b + aOffset;
const char *end = cp + aCount;
while (cp < end) {
*aDest++ = (unsigned char)(*cp++);
}
LossyConvertEncoding8to16 converter(aDest);
copy_string(cp, end, converter);
}
}
}
@ -316,11 +313,10 @@ nsTextFragment::Append(const PRUnichar* aBuffer, PRUint32 aLength)
return;
}
// Copy data
for (PRUint32 i = 0; i < mState.mLength; ++i) {
buff[i] = (unsigned char)m1b[i];
}
// Copy data into buff
LossyConvertEncoding8to16 converter(buff);
copy_string(m1b, m1b+mState.mLength, converter);
memcpy(buff + mState.mLength, aBuffer, aLength * sizeof(PRUnichar));
mState.mLength += aLength;
@ -354,10 +350,10 @@ nsTextFragment::Append(const PRUnichar* aBuffer, PRUint32 aLength)
memcpy(buff, m1b, mState.mLength);
mState.mInHeap = PR_TRUE;
}
for (PRUint32 i = 0; i < aLength; ++i) {
buff[mState.mLength + i] = (char)aBuffer[i];
}
// Copy aBuffer into buff.
LossyConvertEncoding16to8 converter(buff + mState.mLength);
copy_string(aBuffer, aBuffer + aLength, converter);
m1b = buff;
mState.mLength += aLength;

View File

@ -43,6 +43,7 @@
// use XPCOM assertion/debugging macros, etc.
#include "nscore.h"
#include "mozilla/SSE.h"
#include "nsCharTraits.h"
@ -662,39 +663,89 @@ class CalculateUTF8Size
#ifdef MOZILLA_INTERNAL_API
/**
* A character sink that performs a |reinterpret_cast| style conversion
* between character types.
* A character sink that performs a |reinterpret_cast|-style conversion
* from char to PRUnichar.
*/
template <class FromCharT, class ToCharT>
class LossyConvertEncoding
class LossyConvertEncoding8to16
{
public:
typedef FromCharT value_type;
typedef FromCharT input_type;
typedef ToCharT output_type;
typedef typename nsCharTraits<FromCharT>::unsigned_char_type unsigned_input_type;
typedef char value_type;
typedef char input_type;
typedef PRUnichar output_type;
public:
LossyConvertEncoding( output_type* aDestination ) : mDestination(aDestination) { }
LossyConvertEncoding8to16( PRUnichar* aDestination ) :
mDestination(aDestination) { }
void
write( const input_type* aSource, PRUint32 aSourceLength )
write( const char* aSource, PRUint32 aSourceLength )
{
const input_type* done_writing = aSource + aSourceLength;
#ifdef MOZILLA_MAY_SUPPORT_SSE2
if (mozilla::supports_sse2())
{
write_sse2(aSource, aSourceLength);
return;
}
#endif
const char* done_writing = aSource + aSourceLength;
while ( aSource < done_writing )
*mDestination++ = (output_type)(unsigned_input_type)(*aSource++); // use old-style cast to mimic old |ns[C]String| behavior
*mDestination++ = (PRUnichar)(unsigned char)(*aSource++);
}
void
write_sse2( const char* aSource, PRUint32 aSourceLength );
void
write_terminator()
{
*mDestination = output_type(0);
*mDestination = (PRUnichar)(0);
}
private:
output_type* mDestination;
PRUnichar* mDestination;
};
/**
* A character sink that performs a |reinterpret_cast|-style conversion
* from PRUnichar to char.
*/
class LossyConvertEncoding16to8
{
public:
typedef PRUnichar value_type;
typedef PRUnichar input_type;
typedef char output_type;
LossyConvertEncoding16to8( char* aDestination ) : mDestination(aDestination) { }
void
write( const PRUnichar* aSource, PRUint32 aSourceLength)
{
#ifdef MOZILLA_MAY_SUPPORT_SSE2
if (mozilla::supports_sse2())
{
write_sse2(aSource, aSourceLength);
return;
}
#endif
const PRUnichar* done_writing = aSource + aSourceLength;
while ( aSource < done_writing )
*mDestination++ = (char)(*aSource++);
}
#ifdef MOZILLA_MAY_SUPPORT_SSE2
void
write_sse2( const PRUnichar* aSource, PRUint32 aSourceLength );
#endif
void
write_terminator()
{
*mDestination = '\0';
}
private:
char *mDestination;
};
#endif // MOZILLA_INTERNAL_API

View File

@ -70,6 +70,18 @@ FORCE_STATIC_LIB = 1
# Force use of PIC
FORCE_USE_PIC = 1
# Are we targeting x86 or x86-64? If so, compile the SSE2 functions for
# nsUTF8Utils.cpp.
ifneq (,$(INTEL_ARCHITECTURE))
CPPSRCS += nsUTF8UtilsSSE2.cpp
# gcc requires -msse2 on nsUTF8UtilsSSE2.cpp since it uses SSE2 intrinsics.
# (See bug 585538 comment 12.)
ifdef GNU_CC
nsUTF8UtilsSSE2.$(OBJ_SUFFIX): CXXFLAGS+=-msse2
endif
endif
include $(topsrcdir)/config/rules.mk
DEFINES += -D_IMPL_NS_COM

View File

@ -145,9 +145,9 @@ LossyAppendUTF16toASCII( const nsAString& aSource, nsACString& aDest )
dest.advance(old_dest_length);
// right now, this won't work on multi-fragment destinations
LossyConvertEncoding<PRUnichar, char> converter(dest.get());
// right now, this won't work on multi-fragment destinations
LossyConvertEncoding16to8 converter(dest.get());
copy_string(aSource.BeginReading(fromBegin), aSource.EndReading(fromEnd), converter);
}
@ -167,7 +167,7 @@ AppendASCIItoUTF16( const nsACString& aSource, nsAString& aDest )
dest.advance(old_dest_length);
// right now, this won't work on multi-fragment destinations
LossyConvertEncoding<char, PRUnichar> converter(dest.get());
LossyConvertEncoding8to16 converter(dest.get());
copy_string(aSource.BeginReading(fromBegin), aSource.EndReading(fromEnd), converter);
}
@ -303,7 +303,7 @@ ToNewCString( const nsAString& aSource )
return nsnull;
nsAString::const_iterator fromBegin, fromEnd;
LossyConvertEncoding<PRUnichar, char> converter(result);
LossyConvertEncoding16to8 converter(result);
copy_string(aSource.BeginReading(fromBegin), aSource.EndReading(fromEnd), converter).write_terminator();
return result;
}
@ -374,7 +374,7 @@ ToNewUnicode( const nsACString& aSource )
return nsnull;
nsACString::const_iterator fromBegin, fromEnd;
LossyConvertEncoding<char, PRUnichar> converter(result);
LossyConvertEncoding8to16 converter(result);
copy_string(aSource.BeginReading(fromBegin), aSource.EndReading(fromEnd), converter).write_terminator();
return result;
}

View File

@ -0,0 +1,96 @@
#include "nscore.h"
#include <emmintrin.h>
#include <nsUTF8Utils.h>
void
LossyConvertEncoding16to8::write_sse2(const PRUnichar* aSource,
PRUint32 aSourceLength)
{
char* dest = mDestination;
// Align source to a 16-byte boundary.
PRUint32 i = 0;
PRUint32 alignLen =
PR_MIN(aSourceLength, (-NS_PTR_TO_UINT32(aSource) & 0xf) / sizeof(PRUnichar));
for (; i < alignLen; i++) {
dest[i] = static_cast<unsigned char>(aSource[i]);
}
// Walk 64 bytes (four XMM registers) at a time.
__m128i vectmask = _mm_set1_epi16(0x00ff);
for (; aSourceLength - i > 31; i += 32) {
__m128i source1 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i));
source1 = _mm_and_si128(source1, vectmask);
__m128i source2 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 8));
source2 = _mm_and_si128(source2, vectmask);
__m128i source3 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 16));
source3 = _mm_and_si128(source3, vectmask);
__m128i source4 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 24));
source4 = _mm_and_si128(source4, vectmask);
// Pack the source data. SSE2 views this as a saturating uint16 to
// uint8 conversion, but since we masked off the high-order byte of every
// uint16, we're really just grabbing the low-order bytes of source1 and
// source2.
__m128i packed1 = _mm_packus_epi16(source1, source2);
__m128i packed2 = _mm_packus_epi16(source3, source4);
// This store needs to be unaligned since there's no guarantee that the
// alignment we did above for the source will align the destination.
_mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i), packed1);
_mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 16), packed2);
}
// Finish up the rest.
for (; i < aSourceLength; i++) {
dest[i] = static_cast<unsigned char>(aSource[i]);
}
mDestination += i;
}
void
LossyConvertEncoding8to16::write_sse2(const char* aSource,
PRUint32 aSourceLength)
{
PRUnichar *dest = mDestination;
// Align source to a 16-byte boundary. We choose to align source rather than
// dest because we'd rather have our loads than our stores be fast. You have
// to wait for a load to complete, but you can keep on moving after issuing a
// store.
PRUint32 i = 0;
PRUint32 alignLen = PR_MIN(aSourceLength, (-NS_PTR_TO_UINT32(aSource) & 0xf));
for (; i < alignLen; i++) {
dest[i] = static_cast<unsigned char>(aSource[i]);
}
// Walk 32 bytes (two XMM registers) at a time.
for (; aSourceLength - i > 31; i += 32) {
__m128i source1 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i));
__m128i source2 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 16));
// Interleave 0s in with the bytes of source to create lo and hi.
__m128i lo1 = _mm_unpacklo_epi8(source1, _mm_setzero_si128());
__m128i hi1 = _mm_unpackhi_epi8(source1, _mm_setzero_si128());
__m128i lo2 = _mm_unpacklo_epi8(source2, _mm_setzero_si128());
__m128i hi2 = _mm_unpackhi_epi8(source2, _mm_setzero_si128());
// store lo and hi into dest.
_mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i), lo1);
_mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 8), hi1);
_mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 16), lo2);
_mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 24), hi2);
}
// Finish up whatever's left.
for (; i < aSourceLength; i++) {
dest[i] = static_cast<unsigned char>(aSource[i]);
}
mDestination += i;
}