ccache/unify.c

/*
 * Copyright (C) 2002 Andrew Tridgell
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 3 of the License, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc., 51
 * Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/

/*
 * C/C++ unifier
 *
 * The idea is that changes that don't affect the resulting C code should not
 * change the hash. This is achieved by folding white-space and other
 * non-semantic fluff in the input into a single unified format.
 *
 * This unifier was designed to match the output of the unifier in
 * compilercache, which is flex based. The major difference is that this
 * unifier is much faster (about 2x) and more forgiving of syntactic errors.
 * Continuing on syntactic errors is important to cope with C/C++ extensions
 * in the local compiler (for example, inline assembly systems).
 */

#include "ccache.h"

/*
 * Operator and punctuator tokens recognised by the unifier, terminated by a
 * NULL entry.
 *
 * Order matters: build_table() files each token under its first character in
 * the order listed here, and unify() emits the first entry that matches the
 * input. A token must therefore appear before any other token that is a
 * prefix of it (">>=" before ">>" before ">", ":>" before ":", and so on);
 * otherwise the longer token can never be matched.
 */
static const char *const s_tokens[] = {
	/* three-character tokens */
	"...", ">>=", "<<=",
	/* compound assignment */
	"+=", "-=", "*=", "/=", "%=", "&=", "^=", "|=",
	/* other two-character operators */
	">>", "<<", "++", "--", "->", "&&", "||", "<=", ">=", "==", "!=",
	/* punctuation, with the digraphs <% %> <: :> */
	";", "{", "<%", "}", "%>", ",", ":>", ":", "=", "(", ")", "[", "<:", "]",
	/* remaining single-character operators */
	".", "&", "!", "~", "-", "+", "*", "/", "%", "<", ">", "^", "|", "?",
	NULL
};

/*
 * Character class flags stored in tokens[].type. They are bit flags, so one
 * character can belong to several classes at once.
 */
enum {
	C_ALPHA = 1 << 0, /* letter or underscore */
	C_SPACE = 1 << 1, /* white-space */
	C_TOKEN = 1 << 2, /* first character of an entry in s_tokens */
	C_QUOTE = 1 << 3, /* single or double quote */
	C_DIGIT = 1 << 4, /* decimal digit */
	C_HEX   = 1 << 5, /* hexadecimal digit */
	C_FLOAT = 1 << 6, /* numeric suffix letter: l, L, f, F, u, U */
	C_SIGN  = 1 << 7  /* '+' or '-' */
};

/*
 * Per-character lookup table, indexed by byte value and filled in once by
 * build_table().
 */
static struct {
	unsigned char type;     /* bitwise OR of the C_* class flags */
	unsigned char num_toks; /* number of entries of toks[] in use */
	/*
	 * Entries of s_tokens that start with this character, in s_tokens order.
	 * build_table() does not bounds-check this array; the busiest character
	 * in s_tokens ('<') uses 6 of the 7 slots.
	 */
	const char *toks[7];
} tokens[256];

/*
 * Fill in the tokens[] lookup table used by the unifier.
 *
 * The work is done on the first call only; later calls return immediately.
 * Only byte values 0..127 are classified with the <ctype.h> predicates, so
 * bytes >= 128 keep a type of 0.
 */
static void
build_table(void)
{
	static bool initialized;
	static const char suffix_chars[] = "lLfFUu";
	const char *const *tok;
	const char *s;
	int ch;

	if (initialized) {
		return;
	}
	initialized = true;

	memset(tokens, 0, sizeof(tokens));

	/* basic character classes */
	for (ch = 0; ch < 128; ch++) {
		unsigned char type = 0;

		if (isalpha(ch) || ch == '_') {
			type |= C_ALPHA;
		}
		if (isdigit(ch)) {
			type |= C_DIGIT;
		}
		if (isspace(ch)) {
			type |= C_SPACE;
		}
		if (isxdigit(ch)) {
			type |= C_HEX;
		}
		tokens[ch].type = type;
	}

	/* string and character literal delimiters */
	tokens['\''].type |= C_QUOTE;
	tokens['"'].type |= C_QUOTE;

	/* letters accepted as trailing suffixes of a number */
	for (s = suffix_chars; *s; s++) {
		tokens[(unsigned char)*s].type |= C_FLOAT;
	}

	/* signs accepted inside an exponent */
	tokens['-'].type |= C_SIGN;
	tokens['+'].type |= C_SIGN;

	/*
	 * File every operator under its first character, preserving the order
	 * of s_tokens.
	 */
	for (tok = s_tokens; *tok; tok++) {
		unsigned char first = (unsigned char)(*tok)[0];

		tokens[first].type |= C_TOKEN;
		tokens[first].toks[tokens[first].num_toks++] = *tok;
	}
}

/*
 * Buffer up characters before hashing them.
 *
 * Non-zero bytes are collected in a static 64-byte buffer that is handed to
 * hash_buffer() each time it fills up. A zero byte is not stored: it flushes
 * whatever is buffered and then calls hash_buffer(hash, NULL, 0).
 *
 * The buffer is static, so the pending bytes are shared by all callers
 * regardless of which `hash` they pass.
 */
static void
pushchar(struct mdfour *hash, unsigned char c)
{
	static unsigned char pending[64];
	static size_t used;

	if (c != 0) {
		pending[used] = c;
		used++;
		if (used == sizeof(pending)) {
			hash_buffer(hash, (char *)pending, used);
			used = 0;
		}
		return;
	}

	/* c == 0: flush request */
	if (used > 0) {
		hash_buffer(hash, (char *)pending, used);
		used = 0;
	}
	/*
	 * NOTE(review): presumably a NULL buffer tells hash_buffer() that the
	 * input is complete -- confirm against its definition.
	 */
	hash_buffer(hash, NULL, 0);
}

/*
 * Hash some C/C++ code after unifying it.
 *
 * Scans the `size` bytes starting at `p` and feeds a normalised form of them
 * to `hash` through pushchar(). Each recognised item is followed by a single
 * '\n':
 *
 *   - "# <digit>..." lines are dropped entirely; any other line starting at a
 *     '#' is passed through up to the end of the line
 *   - identifiers and numbers are emitted as one item each
 *   - runs of white-space are dropped
 *   - quoted literals are emitted verbatim, honouring backslash escapes
 *   - operators are matched against the entries of s_tokens
 *   - any other byte is emitted on its own
 *
 * A final pushchar(hash, 0) flushes the buffered output.
 *
 * NOTE(review): pushchar() treats a 0 byte as its flush request, so a NUL
 * byte in the input is not hashed as data but triggers that flush path.
 */
static void
unify(struct mdfour *hash, unsigned char *p, size_t size)
{
	size_t ofs;
	unsigned char q;
	int i;

	build_table();

	for (ofs = 0; ofs < size;) {
		if (p[ofs] == '#') {
			if ((size-ofs) > 2 && p[ofs+1] == ' ' && isdigit(p[ofs+2])) {
				/* "# <digit>": skip the whole line without hashing it */
				do {
					ofs++;
				} while (ofs < size && p[ofs] != '\n');
				ofs++;
			} else {
				/* any other '#' line: hash it verbatim up to the newline */
				do {
					pushchar(hash, p[ofs]);
					ofs++;
				} while (ofs < size && p[ofs] != '\n');
				pushchar(hash, '\n');
				ofs++;
			}
			continue;
		}

		/* identifier or keyword */
		if (tokens[p[ofs]].type & C_ALPHA) {
			do {
				pushchar(hash, p[ofs]);
				ofs++;
			} while (ofs < size && (tokens[p[ofs]].type & (C_ALPHA|C_DIGIT)));
			pushchar(hash, '\n');
			continue;
		}

		/*
		 * number: digits and dots, then an optional x/X hex part, an
		 * optional e/E exponent and any trailing suffix letters
		 */
		if (tokens[p[ofs]].type & C_DIGIT) {
			do {
				pushchar(hash, p[ofs]);
				ofs++;
			} while (ofs < size &&
				 ((tokens[p[ofs]].type & C_DIGIT) || p[ofs] == '.'));
			if (ofs < size && (p[ofs] == 'x' || p[ofs] == 'X')) {
				do {
					pushchar(hash, p[ofs]);
					ofs++;
				} while (ofs < size && (tokens[p[ofs]].type & C_HEX));
			}
			if (ofs < size && (p[ofs] == 'E' || p[ofs] == 'e')) {
				pushchar(hash, p[ofs]);
				ofs++;
				while (ofs < size && (tokens[p[ofs]].type & (C_DIGIT|C_SIGN))) {
					pushchar(hash, p[ofs]);
					ofs++;
				}
			}
			while (ofs < size && (tokens[p[ofs]].type & C_FLOAT)) {
				pushchar(hash, p[ofs]);
				ofs++;
			}
			pushchar(hash, '\n');
			continue;
		}

		/* white-space contributes nothing to the hash */
		if (tokens[p[ofs]].type & C_SPACE) {
			do {
				ofs++;
			} while (ofs < size && (tokens[p[ofs]].type & C_SPACE));
			continue;
		}

		/* string or character literal, up to the matching quote */
		if (tokens[p[ofs]].type & C_QUOTE) {
			q = p[ofs];
			pushchar(hash, p[ofs]);
			do {
				ofs++;
				/* a backslash and the byte after it are copied as a pair */
				while (ofs + 1 < size && p[ofs] == '\\') {
					pushchar(hash, p[ofs]);
					pushchar(hash, p[ofs+1]);
					ofs += 2;
				}
				/*
				 * An unterminated literal can leave ofs at the end of the
				 * input here; never read p[size].
				 */
				if (ofs < size) {
					pushchar(hash, p[ofs]);
				}
			} while (ofs < size && p[ofs] != q);
			pushchar(hash, '\n');
			ofs++;
			continue;
		}

		/* operator: the first entry for this character that matches wins */
		if (tokens[p[ofs]].type & C_TOKEN) {
			q = p[ofs];
			for (i = 0; i < tokens[q].num_toks; i++) {
				const char *s = tokens[q].toks[i];
				size_t len = strlen(s);
				if (size - ofs >= len && memcmp(&p[ofs], s, len) == 0) {
					size_t j;
					for (j = 0; j < len; j++) {
						pushchar(hash, (unsigned char)s[j]);
					}
					ofs += len;
					pushchar(hash, '\n');
					break;
				}
			}
			if (i < tokens[q].num_toks) {
				continue;
			}
		}

		/* anything else is emitted as a one-byte item */
		pushchar(hash, p[ofs]);
		pushchar(hash, '\n');
		ofs++;
	}
	pushchar(hash, 0);
}


/*
 * Hash a file that consists of preprocessor output, leaving line number
 * information out of the hash.
 *
 * The file `fname` is loaded with read_file(), run through unify() into
 * `hash`, and the loaded buffer is released with free().
 *
 * Returns 0 on success. If read_file() fails, STATS_PREPROCESSOR is recorded
 * with stats_update() and -1 is returned.
 */
int
unify_hash(struct mdfour *hash, const char *fname)
{
	char *contents;
	size_t length;

	if (read_file(fname, 0, &contents, &length)) {
		unify(hash, (unsigned char *)contents, length);
		free(contents);
		return 0;
	}

	stats_update(STATS_PREPROCESSOR);
	return -1;
}