Improve speed of temporal macro search

2025-02-17 01:29:06 +00:00 · 2010-11-29 20:24:06 +01:00 · 2010-11-29 20:24:06 +01:00 · fe732f041e
commit fe732f041e
parent 90e9de12fa
3 changed files with 122 additions and 145 deletions
--- a/hashutil.c
+++ b/hashutil.c
@ -19,6 +19,7 @@
 #include "ccache.h"
 #include "hashutil.h"
 #include "murmurhashneutral2.h"
 #include "macroskip.h"
 unsigned
 hash_from_string(void *str)
@ -45,125 +46,84 @@ file_hashes_equal(struct file_hash *fh1, struct file_hash *fh2)
 		&& fh1->size == fh2->size;
 }
-#define HASH(ch) \
+/*
-	do {\
+ * Search for the strings "__DATE__" and "__TIME__" in str.
-		hashbuf[hashbuflen] = ch; \
+ *
-		hashbuflen++; \
+ * Returns a bitmask with HASH_SOURCE_CODE_FOUND_DATE and
-		if (hashbuflen == sizeof(hashbuf)) {\
+ * HASH_SOURCE_CODE_FOUND_TIME set appropriately.
-			hash_buffer(hash, hashbuf, sizeof(hashbuf)); \
+ */
-			hashbuflen = 0; \
+static int
-		} \
+check_for_temporal_macros(const char *str, size_t len)
-	} while (0)
+{
 	int result = 0;
 	/*
 	 * We're using the Boyer-Moore-Horspool algorithm, which searches
 	 * starting from the *end* of the needle.  Our needles are 8 characters
 	 * long, so i starts at 7.
 	 */
 	size_t i = 7;
 	while (i < len) {
 		/*
 		 * Check whether the substring ending at str[i] has the form
 		 * '__...E__'.  On the assumption that 'E' is less common in
 		 * source than '_', we check str[i-2] first.
 		 */
 		if (str[i - 2] == 'E' &&
 		    str[i - 0] == '_' &&
 		    str[i - 7] == '_' &&
 		    str[i - 1] == '_' &&
 		    str[i - 6] == '_') {
 			/*
 			 * Check the remaining characters to see if the
 			 * substring is '__DATE__' or '__TIME__'.
 			 */
 			if (str[i - 5] == 'D' && str[i - 4] == 'A' &&
 			    str[i - 3] == 'T') {
 				result |= HASH_SOURCE_CODE_FOUND_DATE;
 			}
 			else if (str[i - 5] == 'T' && str[i - 4] == 'I' &&
 				 str[i - 3] == 'M') {
 				result |= HASH_SOURCE_CODE_FOUND_TIME;
 			}
 		}
 		/*
 		 * macro_skip tells us how far we can skip forward upon seeing
 		 * str[i] at the end of a substring.
 		 */
 		i += macro_skip[(uint8_t)str[i]];
 	}
 	return result;
 }
 /*
- * Hash a string ignoring comments. Returns a bitmask of HASH_SOURCE_CODE_*
+ * Hash a string. Returns a bitmask of HASH_SOURCE_CODE_* results.
 * results.
 */
 int
 hash_source_code_string(
 	struct mdfour *hash, const char *str, size_t len, const char *path)
 {
 	const char *p;
 	const char *end;
 	char hashbuf[64];
 	size_t hashbuflen = 0;
 	int result = HASH_SOURCE_CODE_OK;
 	extern unsigned sloppiness;
-	p = str;
+	/*
-	end = str + len;
+	 * Check for __DATE__ and __TIME__ if the sloppiness argument tells us
-	while (1) {
+	 * we have to.
-		if (p >= end) {
+	 */
-			goto end;
+	if (!(sloppiness & SLOPPY_TIME_MACROS)) {
-		}
+		result |= check_for_temporal_macros(str, len);
 		switch (*p) {
 		/* Potential start of comment. */
 		case '/':
 			if (p+1 == end) {
 				break;
 			}
 			switch (*(p+1)) {
 			case '*':
 				HASH(' '); /* Don't paste tokens together when removing the comment. */
 				p += 2;
 				while (p+1 < end
 				       && (*p != '*' || *(p+1) != '/')) {
 					if (*p == '\n') {
 						/* Keep line numbers. */
 						HASH('\n');
 					}
 					p++;
 				}
 				if (p+1 == end) {
 					goto end;
 				}
 				p += 2;
 				continue;
 			case '/':
 				p += 2;
 				while (p < end
 				       && (*p != '\n' || *(p-1) == '\\')) {
 					p++;
 				}
 				continue;
 			default:
 				break;
 			}
 			break;
 		/* Start of string. */
 		case '"':
 			HASH(*p);
 			p++;
 			while (p < end && (*p != '"' || *(p-1) == '\\')) {
 				HASH(*p);
 				p++;
 			}
 			if (p == end) {
 				goto end;
 			}
 			break;
 		/* Potential start of volatile macro. */
 		case '_':
 			if (p + 7 < end
 			    && p[1] == '_' && p[5] == 'E'
 			    && p[6] == '_' && p[7] == '_') {
 				if (p[2] == 'D' && p[3] == 'A'
 				    && p[4] == 'T') {
 					result |= HASH_SOURCE_CODE_FOUND_DATE;
 				} else if (p[2] == 'T' && p[3] == 'I'
 				           && p[4] == 'M') {
 					result |= HASH_SOURCE_CODE_FOUND_TIME;
 				}
 				/*
 				 * Of course, we can't be sure that we have found a __{DATE,TIME}__
 				 * that's actually used, but better safe than sorry. And if you do
 				 * something like
 				 *
 				 * #define TIME __TI ## ME__
 				 *
 				 * in your code, you deserve to get a false cache hit.
 				 */
 			}
 			break;
 		default:
 			break;
 		}
 		HASH(*p);
 		p++;
 	}
-end:
+	/*
-	hash_buffer(hash, hashbuf, hashbuflen);
+	 * Hash the source string.
 	 */
 	hash_buffer(hash, str, len);
 	if (sloppiness & SLOPPY_TIME_MACROS) {
 		return 0;
 	}
 	if (result & HASH_SOURCE_CODE_FOUND_DATE) {
 		/*
 		 * Make sure that the hash sum changes if the (potential) expansion of
--- a/macroskip.h
+++ b/macroskip.h
@ -0,0 +1,56 @@
 /*
 * A Boyer-Moore-Horspool skip table used for searching for the strings
 * "__TIME__" and "__DATE__".
 *
 * macro_skip[c] = 8 for all c not in "__TIME__" and "__DATE__".
 *
 * The other characters map as follows:
 *
 *   _ -> 1
 *   A -> 5
 *   D -> 6
 *   E -> 3
 *   I -> 5
 *   M -> 4
 *   T -> 4
 *
 *
 * This was generated with the following Python script:
 *
 * m = {'_': 1,
 *      'A': 5,
 *      'D': 6,
 *      'E': 3,
 *      'I': 5,
 *      'M': 4,
 *      'T': 4}
 *
 * for i in range(0, 256):
 *     if chr(i) in m:
 *         num = m[chr(i)]
 *     else:
 *         num = 8
 *     print ("%d, " % num),
 *
 *     if i % 16 == 15:
 *         print ""
 */
 static const uint32_t macro_skip[] = {
 	8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
 	8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
 	8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
 	8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
 	8,  5,  8,  8,  6,  3,  8,  8,  8,  5,  8,  8,  8,  4,  8,  8,
 	8,  8,  8,  8,  4,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  1,
 	8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
 	8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
 	8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
 	8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
 	8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
 	8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
 	8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
 	8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
 	8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
 	8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
 };
--- a/test.sh
+++ b/test.sh
@ -866,45 +866,6 @@ EOF
    checkstat 'cache miss' 1
    checkfile stderr-mf.txt "`cat stderr-orig.txt`"
    ##################################################################
    # Check that changes in comments are ignored when hashing.
    testname="changes in comments"
    $CCACHE -C >/dev/null
    $CCACHE -z >/dev/null
    cat <<EOF >comments.h
 /*
 * /* foo comment
 */
 EOF
    backdate comments.h
    cat <<'EOF' >comments.c
 #include "comments.h"
 char test[] = "\
 /* apple */ // banana"; // foo comment
 EOF
    $CCACHE $COMPILER -c comments.c
    checkstat 'cache hit (direct)' 0
    checkstat 'cache hit (preprocessed)' 0
    checkstat 'cache miss' 1
    sed_in_place 's/foo/ignored/' comments.h comments.c
    backdate comments.h
    $CCACHE $COMPILER -c comments.c
    checkstat 'cache hit (direct)' 1
    checkstat 'cache hit (preprocessed)' 0
    checkstat 'cache miss' 1
    # Check that comment-like string contents are hashed.
    sed_in_place 's/apple/orange/' comments.c
    backdate comments.h
    $CCACHE $COMPILER -c comments.c
    checkstat 'cache hit (direct)' 1
    checkstat 'cache hit (preprocessed)' 0
    checkstat 'cache miss' 2
    ##################################################################
    # Check that it is possible to compile and cache an empty source code file.
    testname="empty source file"