// Copyright (c) 2012- PPSSPP Project. // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, version 2.0 or later versions. // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License 2.0 for more details. // A copy of the GPL 2.0 should have been included with the program. // If not, see http://www.gnu.org/licenses/ // Official git repository and contact information can be found at // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. #include #include "ppsspp_config.h" #include "CPUDetect.h" #include "Common.h" #ifdef _M_SSE #include #endif #if PPSSPP_ARCH(ARM_NEON) #if defined(_MSC_VER) && PPSSPP_ARCH(ARM64) #include #else #include #endif #endif #include "IndexGenerator.h" // Points don't need indexing... const u8 IndexGenerator::indexedPrimitiveType[7] = { GE_PRIM_POINTS, GE_PRIM_LINES, GE_PRIM_LINES, GE_PRIM_TRIANGLES, GE_PRIM_TRIANGLES, GE_PRIM_TRIANGLES, GE_PRIM_RECTANGLES, }; void IndexGenerator::Setup(u16 *inds) { this->indsBase_ = inds; Reset(); } void IndexGenerator::AddPrim(int prim, int vertexCount, int indexOffset, bool clockwise) { switch (prim) { case GE_PRIM_POINTS: AddPoints(vertexCount, indexOffset); break; case GE_PRIM_LINES: AddLineList(vertexCount, indexOffset); break; case GE_PRIM_LINE_STRIP: AddLineStrip(vertexCount, indexOffset); break; case GE_PRIM_TRIANGLES: AddList(vertexCount, indexOffset, clockwise); break; case GE_PRIM_TRIANGLE_STRIP: AddStrip(vertexCount, indexOffset, clockwise); break; case GE_PRIM_TRIANGLE_FAN: AddFan(vertexCount, indexOffset, clockwise); break; case GE_PRIM_RECTANGLES: AddRectangles(vertexCount, indexOffset); break; // Same } } void IndexGenerator::AddPoints(int numVerts, int indexOffset) { u16 *outInds = inds_; for (int i = 0; i < numVerts; i++) *outInds++ = indexOffset + i; inds_ = outInds; } void IndexGenerator::AddList(int numVerts, int indexOffset, bool clockwise) { u16 *outInds = inds_; const int v1 = clockwise ? 1 : 2; const int v2 = clockwise ? 2 : 1; for (int i = 0; i < numVerts; i += 3) { *outInds++ = indexOffset + i; *outInds++ = indexOffset + i + v1; *outInds++ = indexOffset + i + v2; } inds_ = outInds; } alignas(16) static const u16 offsets_clockwise[24] = { 0, (u16)(0 + 1), (u16)(0 + 2), 1, (u16)(1 + 2), (u16)(1 + 1), 2, (u16)(2 + 1), (u16)(2 + 2), 3, (u16)(3 + 2), (u16)(3 + 1), 4, (u16)(4 + 1), (u16)(4 + 2), 5, (u16)(5 + 2), (u16)(5 + 1), 6, (u16)(6 + 1), (u16)(6 + 2), 7, (u16)(7 + 2), (u16)(7 + 1), }; alignas(16) static const uint16_t offsets_counter_clockwise[24] = { 0, (u16)(0 + 2), (u16)(0 + 1), 1, (u16)(1 + 1), (u16)(1 + 2), 2, (u16)(2 + 2), (u16)(2 + 1), 3, (u16)(3 + 1), (u16)(3 + 2), 4, (u16)(4 + 2), (u16)(4 + 1), 5, (u16)(5 + 1), (u16)(5 + 2), 6, (u16)(6 + 2), (u16)(6 + 1), 7, (u16)(7 + 1), (u16)(7 + 2), }; void IndexGenerator::AddStrip(int numVerts, int indexOffset, bool clockwise) { int numTris = numVerts - 2; if (numTris <= 0) { return; } #ifdef _M_SSE // In an SSE2 register we can fit 8 16-bit integers. // However, we need to output a multiple of 3 indices. // The first such multiple is 24, which means we'll generate 24 indices per cycle, // which corresponds to 8 triangles. That's pretty cool. // We allow ourselves to write some extra indices to avoid the fallback loop. // That's alright as we're appending to a buffer - they will get overwritten anyway. __m128i ibase8 = _mm_set1_epi16(indexOffset); const __m128i *offsets = (const __m128i *)(clockwise ? offsets_clockwise : offsets_counter_clockwise); __m128i *dst = (__m128i *)inds_; __m128i offsets0 = _mm_add_epi16(ibase8, _mm_load_si128(offsets)); // A single store is always enough for two triangles, which is a very common case. _mm_storeu_si128(dst, offsets0); if (numTris > 2) { __m128i offsets1 = _mm_add_epi16(ibase8, _mm_load_si128(offsets + 1)); _mm_storeu_si128(dst + 1, offsets1); if (numTris > 5) { __m128i offsets2 = _mm_add_epi16(ibase8, _mm_load_si128(offsets + 2)); _mm_storeu_si128(dst + 2, offsets2); __m128i increment = _mm_set1_epi16(8); int numChunks = (numTris + 7) >> 3; for (int i = 1; i < numChunks; i++) { dst += 3; offsets0 = _mm_add_epi16(offsets0, increment); offsets1 = _mm_add_epi16(offsets1, increment); offsets2 = _mm_add_epi16(offsets2, increment); _mm_storeu_si128(dst, offsets0); _mm_storeu_si128(dst + 1, offsets1); _mm_storeu_si128(dst + 2, offsets2); } } } inds_ += numTris * 3; // wind doesn't need to be updated, an even number of triangles have been drawn. #elif PPSSPP_ARCH(ARM_NEON) uint16x8_t ibase8 = vdupq_n_u16(indexOffset); const u16 *offsets = clockwise ? offsets_clockwise : offsets_counter_clockwise; u16 *dst = inds_; uint16x8_t offsets0 = vaddq_u16(ibase8, vld1q_u16(offsets)); vst1q_u16(dst, offsets0); if (numTris > 2) { uint16x8_t offsets1 = vaddq_u16(ibase8, vld1q_u16(offsets + 8)); vst1q_u16(dst + 8, offsets1); if (numTris > 5) { uint16x8_t offsets2 = vaddq_u16(ibase8, vld1q_u16(offsets + 16)); vst1q_u16(dst + 16, offsets2); uint16x8_t increment = vdupq_n_u16(8); int numChunks = (numTris + 7) >> 3; for (int i = 1; i < numChunks; i++) { dst += 3 * 8; offsets0 = vaddq_u16(offsets0, increment); offsets1 = vaddq_u16(offsets1, increment); offsets2 = vaddq_u16(offsets2, increment); vst1q_u16(dst, offsets0); vst1q_u16(dst + 8, offsets1); vst1q_u16(dst + 16, offsets2); } } } inds_ += numTris * 3; #else // Slow fallback loop. int wind = clockwise ? 1 : 2; int ibase = indexOffset; size_t numPairs = numTris / 2; u16 *outInds = inds_; while (numPairs > 0) { *outInds++ = ibase; *outInds++ = ibase + wind; *outInds++ = ibase + (wind ^ 3); *outInds++ = ibase + 1; *outInds++ = ibase + 1 + (wind ^ 3); *outInds++ = ibase + 1 + wind; ibase += 2; numPairs--; } if (numTris & 1) { *outInds++ = ibase; *outInds++ = ibase + wind; wind ^= 3; // toggle between 1 and 2 *outInds++ = ibase + wind; } inds_ = outInds; #endif } void IndexGenerator::AddFan(int numVerts, int indexOffset, bool clockwise) { const int numTris = numVerts - 2; u16 *outInds = inds_; const int v1 = clockwise ? 1 : 2; const int v2 = clockwise ? 2 : 1; for (int i = 0; i < numTris; i++) { *outInds++ = indexOffset; *outInds++ = indexOffset + i + v1; *outInds++ = indexOffset + i + v2; } inds_ = outInds; } //Lines void IndexGenerator::AddLineList(int numVerts, int indexOffset) { u16 *outInds = inds_; for (int i = 0; i < numVerts; i += 2) { *outInds++ = indexOffset + i; *outInds++ = indexOffset + i + 1; } inds_ = outInds; } void IndexGenerator::AddLineStrip(int numVerts, int indexOffset) { const int numLines = numVerts - 1; u16 *outInds = inds_; for (int i = 0; i < numLines; i++) { *outInds++ = indexOffset + i; *outInds++ = indexOffset + i + 1; } inds_ = outInds; } void IndexGenerator::AddRectangles(int numVerts, int indexOffset) { u16 *outInds = inds_; //rectangles always need 2 vertices, disregard the last one if there's an odd number numVerts = numVerts & ~1; for (int i = 0; i < numVerts; i += 2) { *outInds++ = indexOffset + i; *outInds++ = indexOffset + i + 1; } inds_ = outInds; } template void IndexGenerator::TranslatePoints(int numInds, const ITypeLE *inds, int indexOffset) { u16 *outInds = inds_; for (int i = 0; i < numInds; i++) *outInds++ = indexOffset + inds[i]; inds_ = outInds; } template void IndexGenerator::TranslateLineList(int numInds, const ITypeLE *inds, int indexOffset) { u16 *outInds = inds_; numInds = numInds & ~1; for (int i = 0; i < numInds; i += 2) { *outInds++ = indexOffset + inds[i]; *outInds++ = indexOffset + inds[i + 1]; } inds_ = outInds; } template void IndexGenerator::TranslateLineStrip(int numInds, const ITypeLE *inds, int indexOffset) { int numLines = numInds - 1; u16 *outInds = inds_; for (int i = 0; i < numLines; i++) { *outInds++ = indexOffset + inds[i]; *outInds++ = indexOffset + inds[i + 1]; } inds_ = outInds; } template void IndexGenerator::TranslateList(int numInds, const ITypeLE *inds, int indexOffset, bool clockwise) { // We only bother doing this minor optimization in triangle list, since it's by far the most // common operation that can benefit. if (sizeof(ITypeLE) == sizeof(inds_[0]) && indexOffset == 0 && clockwise) { memcpy(inds_, inds, numInds * sizeof(ITypeLE)); inds_ += numInds; } else { u16 *outInds = inds_; int numTris = numInds / 3; // Round to whole triangles numInds = numTris * 3; const int v1 = clockwise ? 1 : 2; const int v2 = clockwise ? 2 : 1; // TODO: This can actually be SIMD-d, although will need complex shuffles if clockwise. for (int i = 0; i < numInds; i += 3) { *outInds++ = indexOffset + inds[i]; *outInds++ = indexOffset + inds[i + v1]; *outInds++ = indexOffset + inds[i + v2]; } inds_ = outInds; } } template void IndexGenerator::TranslateStrip(int numInds, const ITypeLE *inds, int indexOffset, bool clockwise) { int wind = clockwise ? 1 : 2; int numTris = numInds - 2; u16 *outInds = inds_; for (int i = 0; i < numTris; i++) { *outInds++ = indexOffset + inds[i]; *outInds++ = indexOffset + inds[i + wind]; wind ^= 3; // Toggle between 1 and 2 *outInds++ = indexOffset + inds[i + wind]; } inds_ = outInds; } template void IndexGenerator::TranslateFan(int numInds, const ITypeLE *inds, int indexOffset, bool clockwise) { if (numInds <= 0) return; int numTris = numInds - 2; u16 *outInds = inds_; const int v1 = clockwise ? 1 : 2; const int v2 = clockwise ? 2 : 1; for (int i = 0; i < numTris; i++) { *outInds++ = indexOffset + inds[0]; *outInds++ = indexOffset + inds[i + v1]; *outInds++ = indexOffset + inds[i + v2]; } inds_ = outInds; } template inline void IndexGenerator::TranslateRectangles(int numInds, const ITypeLE *inds, int indexOffset) { u16 *outInds = inds_; //rectangles always need 2 vertices, disregard the last one if there's an odd number numInds = numInds & ~1; for (int i = 0; i < numInds; i += 2) { *outInds++ = indexOffset + inds[i]; *outInds++ = indexOffset + inds[i+1]; } inds_ = outInds; } // Could template this too, but would have to define in header. void IndexGenerator::TranslatePrim(int prim, int numInds, const u8 *inds, int indexOffset, bool clockwise) { switch (prim) { case GE_PRIM_POINTS: TranslatePoints(numInds, inds, indexOffset); break; case GE_PRIM_LINES: TranslateLineList(numInds, inds, indexOffset); break; case GE_PRIM_LINE_STRIP: TranslateLineStrip(numInds, inds, indexOffset); break; case GE_PRIM_TRIANGLES: TranslateList(numInds, inds, indexOffset, clockwise); break; case GE_PRIM_TRIANGLE_STRIP: TranslateStrip(numInds, inds, indexOffset, clockwise); break; case GE_PRIM_TRIANGLE_FAN: TranslateFan(numInds, inds, indexOffset, clockwise); break; case GE_PRIM_RECTANGLES: TranslateRectangles(numInds, inds, indexOffset); break; // Same } } void IndexGenerator::TranslatePrim(int prim, int numInds, const u16_le *inds, int indexOffset, bool clockwise) { switch (prim) { case GE_PRIM_POINTS: TranslatePoints(numInds, inds, indexOffset); break; case GE_PRIM_LINES: TranslateLineList(numInds, inds, indexOffset); break; case GE_PRIM_LINE_STRIP: TranslateLineStrip(numInds, inds, indexOffset); break; case GE_PRIM_TRIANGLES: TranslateList(numInds, inds, indexOffset, clockwise); break; case GE_PRIM_TRIANGLE_STRIP: TranslateStrip(numInds, inds, indexOffset, clockwise); break; case GE_PRIM_TRIANGLE_FAN: TranslateFan(numInds, inds, indexOffset, clockwise); break; case GE_PRIM_RECTANGLES: TranslateRectangles(numInds, inds, indexOffset); break; // Same } } void IndexGenerator::TranslatePrim(int prim, int numInds, const u32_le *inds, int indexOffset, bool clockwise) { switch (prim) { case GE_PRIM_POINTS: TranslatePoints(numInds, inds, indexOffset); break; case GE_PRIM_LINES: TranslateLineList(numInds, inds, indexOffset); break; case GE_PRIM_LINE_STRIP: TranslateLineStrip(numInds, inds, indexOffset); break; case GE_PRIM_TRIANGLES: TranslateList(numInds, inds, indexOffset, clockwise); break; case GE_PRIM_TRIANGLE_STRIP: TranslateStrip(numInds, inds, indexOffset, clockwise); break; case GE_PRIM_TRIANGLE_FAN: TranslateFan(numInds, inds, indexOffset, clockwise); break; case GE_PRIM_RECTANGLES: TranslateRectangles(numInds, inds, indexOffset); break; // Same } }