CMake/Source/cm_utf8.c
Brad King 86578eccf2 Simplify CMake per-source license notices
Per-source copyright/license notice headers that spell out copyright holder
names and years are hard to maintain and often out-of-date or plain wrong.
Precise contributor information is already maintained automatically by the
version control tool.  Ultimately it is the receiver of a file who is
responsible for determining its licensing status, and per-source notices are
merely a convenience.  Therefore it is simpler and more accurate for
each source to have a generic notice of the license name and references to
more detailed information on copyright holders and full license terms.

Our `Copyright.txt` file now contains a list of Contributors whose names
appeared source-level copyright notices.  It also references version control
history for more precise information.  Therefore we no longer need to spell
out the list of Contributors in each source file notice.

Replace CMake per-source copyright/license notice headers with a short
description of the license and links to `Copyright.txt` and online information
available from "https://cmake.org/licensing".  The online URL also handles
cases of modules being copied out of our source into other projects, so we
can drop our notices about replacing links with full license text.

Run the `Utilities/Scripts/filter-notices.bash` script to perform the majority
of the replacements mechanically.  Manually fix up shebang lines and trailing
newlines in a few files.  Manually update the notices in a few files that the
script does not handle.
2016-09-27 15:14:44 -04:00

78 lines
2.6 KiB
C

/* Distributed under the OSI-approved BSD 3-Clause License. See accompanying
file Copyright.txt or https://cmake.org/licensing for details. */
#include "cm_utf8.h"
/*
RFC 3629
07-bit: 0xxxxxxx
11-bit: 110xxxxx 10xxxxxx
16-bit: 1110xxxx 10xxxxxx 10xxxxxx
21-bit: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
Pre-RFC Compatibility
26-bit: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
31-bit: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
*/
/* Number of leading ones before a zero in the byte. */
static unsigned char const cm_utf8_ones[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 7, 8
};
/* Mask away control bits from bytes with n leading ones. */
static unsigned char const cm_utf8_mask[7] = { 0xEF, 0x3F, 0x1F, 0x0F,
0x07, 0x03, 0x01 };
/* Minimum allowed value when first byte has n leading ones. */
static unsigned int const cm_utf8_min[7] = {
0, 0, 1u << 7, 1u << 11, 1u << 16, 1u << 21, 1u << 26 /*, 1u<<31 */
};
const char* cm_utf8_decode_character(const char* first, const char* last,
unsigned int* pc)
{
/* Count leading ones in the first byte. */
unsigned char c = (unsigned char)*first++;
unsigned char const ones = cm_utf8_ones[c];
switch (ones) {
case 0:
*pc = c;
return first; /* One-byte character. */
case 1:
case 7:
case 8:
return 0; /* Invalid leading byte. */
default:
break;
}
/* Extract bits from this multi-byte character. */
{
unsigned int uc = c & cm_utf8_mask[ones];
int left;
for (left = ones - 1; left && first != last; --left) {
c = (unsigned char)*first++;
if (cm_utf8_ones[c] != 1) {
return 0;
}
uc = (uc << 6) | (c & cm_utf8_mask[1]);
}
if (left > 0 || uc < cm_utf8_min[ones]) {
return 0;
}
*pc = uc;
return first;
}
}