2017-01-09 02:18:14 +00:00
|
|
|
/* radare - LGPL - Copyright 2009-2017 - pancake, nikolai */
|
2009-02-05 21:08:46 +00:00
|
|
|
|
|
|
|
#include <r_diff.h>
|
|
|
|
|
2015-06-04 22:53:36 +00:00
|
|
|
//R_LIB_VERSION (r_diff);
|
2013-06-15 00:56:25 +00:00
|
|
|
|
2016-05-25 07:07:23 +00:00
|
|
|
/*
 * Create a new RDiff whose two sides start at the given base offsets.
 *
 * The object comes from R_NEW0 (presumably a zeroing allocator -- confirm in
 * the project headers). Delta mode is enabled by default and no user pointer
 * is attached.
 *
 * Returns the new object, or NULL when R_NEW0 yields NULL.
 */
R_API RDiff *r_diff_new_from(ut64 off_a, ut64 off_b) {
	RDiff *diff = R_NEW0 (RDiff);
	if (!diff) {
		return NULL;
	}
	diff->delta = 1;
	diff->user = NULL;
	diff->off_a = off_a;
	diff->off_b = off_b;
	return diff;
}
|
|
|
|
|
2016-05-25 07:07:23 +00:00
|
|
|
/*
 * Convenience constructor: same as r_diff_new_from() with both base
 * offsets set to zero.
 */
R_API RDiff *r_diff_new() {
	const ut64 base_a = 0;
	const ut64 base_b = 0;
	return r_diff_new_from (base_a, base_b);
}
|
|
|
|
|
2010-03-08 11:45:22 +00:00
|
|
|
/*
 * Release an RDiff previously obtained from r_diff_new()/r_diff_new_from().
 *
 * Always returns NULL so callers can write `d = r_diff_free (d);`.
 * Passing NULL is harmless because free(NULL) is a no-op.
 */
R_API RDiff *r_diff_free(RDiff *d) {
	RDiff *cleared = NULL;
	free (d);
	return cleared;
}
|
|
|
|
|
2010-03-08 11:45:22 +00:00
|
|
|
/*
 * Install the callback and the opaque user pointer stored next to it.
 *
 * d        - diff context to configure
 * callback - function pointer stored in d->callback
 * user     - pointer stored in d->user
 *
 * Returns 1 on success, 0 when d is NULL (previously a NULL d was
 * dereferenced).
 */
R_API int r_diff_set_callback(RDiff *d, RDiffCallback callback, void *user) {
	if (!d) {
		return 0;
	}
	d->callback = callback;
	d->user = user;
	return 1;
}
|
|
|
|
|
2010-03-08 11:45:22 +00:00
|
|
|
/*
 * Set the delta flag of a diff context.
 *
 * r_diff_buffers() dispatches to the delta algorithm when this value is
 * non-zero and to the static byte-by-byte comparison when it is zero.
 *
 * Returns 1 on success, 0 when d is NULL (previously a NULL d was
 * dereferenced).
 */
R_API int r_diff_set_delta(RDiff *d, int delta) {
	if (!d) {
		return 0;
	}
	d->delta = delta;
	return 1;
}
|
|
|
|
|
2010-03-08 11:45:22 +00:00
|
|
|
/*
 * Hand one run of differing bytes to the user callback.
 *
 * end - index just past the last differing byte of the run
 * run - number of consecutive differing bytes (> 0)
 */
static void diff_static_report(RDiff *d, const ut8 *a, const ut8 *b, int end, int run) {
	const int begin = end - run;
	struct r_diff_op_t op = {
		.a_off = d->off_a + begin, .a_buf = a + begin, .a_len = run,
		.b_off = d->off_b + begin, .b_buf = b + begin, .b_len = run
	};
	d->callback (d, d->user, &op);
}

/*
 * Compare two buffers position by position and invoke d->callback once for
 * every maximal run of consecutive differing bytes.
 *
 * d      - diff context; d->callback must be set, d->off_a/d->off_b are added
 *          to the reported offsets
 * a, la  - first buffer and its length (the absolute value of la is used)
 * b, lb  - second buffer and its length (the absolute value of lb is used)
 *
 * When the lengths differ only the common prefix is compared and a warning
 * is written to stderr.
 *
 * Each reported op covers exactly the differing run: the buffers point at the
 * first differing byte and both lengths equal the run length. The in-loop
 * report used to pass the whole-buffer lengths la/lb here, which disagreed
 * with the trailing report and overstated what is readable from a_buf/b_buf.
 *
 * Always returns 0.
 */
R_API int r_diff_buffers_static(RDiff *d, const ut8 *a, int la, const ut8 *b, int lb) {
	int i, len;
	int hit = 0;
	la = R_ABS (la);
	lb = R_ABS (lb);
	if (la != lb) {
		len = R_MIN (la, lb);
		fprintf (stderr,
			"Buffer truncated to %d bytes (%d not compared)\n",
			len, R_ABS (lb - la));
	} else {
		len = la;
	}
	for (i = 0; i < len; i++) {
		if (a[i] != b[i]) {
			// still inside a differing run
			hit++;
		} else if (hit > 0) {
			// a matching byte closes the run that ended at i - 1
			diff_static_report (d, a, b, i, hit);
			hit = 0;
		}
	}
	if (hit > 0) {
		// the buffers ended while a run was still open
		diff_static_report (d, a, b, i, hit);
	}
	return 0;
}
|
|
|
|
|
2014-05-02 23:35:44 +00:00
|
|
|
// XXX: temporary files are written to the current directory under the fixed names ".a" and ".b"
|
2017-01-09 02:18:14 +00:00
|
|
|
/*
 * Print a unified diff of two buffers by delegating to the external `diff`
 * command.
 *
 * Both buffers are written to the files ".a" and ".b" (relative paths):
 * raw when the first bytes of `a` look printable, as a hexdump otherwise.
 * `diff -ru .a .b` is then executed and both files are removed afterwards.
 *
 * The return values of the file helpers and of the command are not checked.
 * Always returns 0.
 */
R_API int r_diff_buffers_unified(RDiff *d, const ut8 *a, int la, const ut8 *b, int lb) {
	// only the first (at most) five bytes of `a` decide the dump format
	const int probe_len = R_MIN (5, la);
	if (!r_mem_is_printable (a, probe_len)) {
		r_file_hexdump (".a", a, la, 0);
		r_file_hexdump (".b", b, lb, 0);
	} else {
		r_file_dump (".a", a, la, 0);
		r_file_dump (".b", b, lb, 0);
	}
	r_sys_cmd ("diff -ru .a .b");
	r_file_rm (".a");
	r_file_rm (".b");
	return 0;
}
|
|
|
|
|
2010-03-08 11:45:22 +00:00
|
|
|
/*
 * Diff two buffers using the algorithm selected by d->delta:
 * the static byte-by-byte comparison when it is zero, the delta algorithm
 * otherwise. The result of the chosen routine is returned unchanged.
 */
R_API int r_diff_buffers(RDiff *d, const ut8 *a, ut32 la, const ut8 *b, ut32 lb) {
	if (!d->delta) {
		return r_diff_buffers_static (d, a, la, b, lb);
	}
	return r_diff_buffers_delta (d, a, la, b, lb);
}
|
2009-03-31 14:52:58 +00:00
|
|
|
|
2016-09-21 10:07:44 +00:00
|
|
|
/*
 * Edit distance between two buffers, computed with two rolling rows and a
 * search window kept around the diagonal.
 *
 * d          - optional context; only d->verbose is read (progress output)
 * a, la      - first buffer and its length
 * b, lb      - second buffer and its length
 * distance   - optional out: resulting distance
 * similarity - optional out: 1 - distance / max(la, lb)
 *
 * Returns false when a buffer is NULL or empty or when an allocation fails,
 * true otherwise.
 *
 * Changes with respect to the previous revision:
 *  - the inner loop may reach j == bLen; the byte comparison no longer reads
 *    bBufPtr[bLen] (one past the end) and treats that position as a mismatch
 *  - *similarity is written even when `distance` is NULL
 */
R_API bool r_diff_buffers_distance_levenstein(RDiff *d, const ut8 *a, ut32 la, const ut8 *b, ut32 lb, ut32 *distance, double *similarity) {
	const bool verbose = d? d->verbose: false;
	/*
	More memory efficient version on Levenshtein Distance from:
	https://en.wikipedia.org/wiki/Levenshtein_distance
	http://www.codeproject.com/Articles/13525/Fast-memory-efficient-Levenshtein-algorithm
	ObM..

	8/July/2016 - More time efficient Levenshtein Distance. Now runs in about O(N*sum(MDistance)) instead of O(NM)
	In real world testing the speedups for similar files are immense. Processing of
	radiff2 -sV routerA/firmware_extract/bin/httpd routerB/firmware_extract/bin/httpd
	reduced from 28 hours to about 13 minutes.
	*/
	int i, j;
	const ut8 *aBufPtr;
	const ut8 *bBufPtr;
	ut32 aLen;
	ut32 bLen;

	// temp pointer will be used to switch v0 and v1 after processing the inner loop.
	int *temp;
	int *v0, *v1;

	// We need these variables outside the context of the loops as we need to
	// survive multiple loop iterations.
	// start and stop are used in our inner loop
	// colMin tells us the current 'best' edit distance.
	// extendStop & extendStart are used when we get 'double up' edge conditions
	// that require us to keep some more data.
	int start = 0;
	int stop = 0;
	int smallest;
	int colMin = 0;
	int extendStop = 0;
	int extendStart = 0;

	// we could move cost into the 'i' loop.
	int cost = 0;

	// loops can get very big, this can be removed, but it's currently in there for debugging
	// and optimisation testing.
	ut64 loops = 0;

	// We need the longest file to be 'A' because our optimisation tries to stop and start
	// around the diagonal.
	//  AAAAAAA
	// B*
	// B *
	// B  *____
	// if we have them the other way around and we terminate on the diagonal, we won't have
	// inspected all the bytes of file B..
	//  AAAA
	// B*
	// B *
	// B  *
	// B   *
	// B   ?

	if (la < lb) {
		aBufPtr = b;
		bBufPtr = a;
		aLen = lb;
		bLen = la;
	} else {
		aBufPtr = a;
		bBufPtr = b;
		aLen = la;
		bLen = lb;
	}
	stop = bLen;
	// Preliminary tests

	// Do we have both files a & b, and are they at least one byte?
	if (!aBufPtr || !bBufPtr || aLen < 1 || bLen < 1) {
		return false;
	}

	// If the files are the same size and are identical, then we have matching files
	if (aLen == bLen && !memcmp (aBufPtr, bBufPtr, aLen)) {
		if (distance) {
			*distance = 0;
		}
		if (similarity) {
			*similarity = 1.0;
		}
		return true;
	}
	// Only calloc if we have to do some processing

	// calloc v0 & v1 and check they initialised
	// (bLen + 3 entries: the inner loop writes up to index j + 2 with j <= bLen)
	v0 = (int*) calloc ((bLen + 3), sizeof (int));
	if (!v0) {
		eprintf ("Error: cannot allocate %i bytes.", bLen + 3);
		return false;
	}

	v1 = (int*) calloc ((bLen + 3), sizeof (int));
	if (!v1) {
		eprintf ("Error: cannot allocate %i bytes", 2 * (bLen + 3));
		free (v0);
		return false;
	}

	// initialise v0 and v1.
	// With the optimisation we strictly only need to initialise v0[0..2]=0..2 & v1[0] = 1;
	for (i = 0; i < bLen + 1 ; i++) {
		v0[i] = i;
		v1[i] = i + 1;
	}

	// Outer loop = the length of the longest input file.
	for (i = 0; i < aLen; i++) {

		// We're going to stop the inner loop at:
		// bLen (so we don't run off the end of our array)
		// or 'two below the diagonal' PLUS any extension we need for 'double up' edge values
		// (see extendStop for logic)
		stop = R_MIN ((i + extendStop + 2), bLen);

		// We need a value in the result column (v1[start]).
		// If you look at the loop below, we need it because we look at v1[j] as one of the
		// potential shortest edit distances.
		// In all cases where the edit distance can't 'reach',
		// the value of v1[start] simply increments.
		// NOTE(review): start is an int compared against the ut32 bLen; assuming ut32 is
		// an unsigned 32-bit type, a negative start converts to a huge unsigned value and
		// also takes this break, which keeps v1[start] in bounds -- confirm this is intended.
		if (start > bLen) {
			break;
		}
		v1[start] = v0[start] + 1;

		// need to have a bigger number in colMin than we'll ever encounter in the inner loop
		colMin = aLen;

		// Inner loop does all the work:
		for (j = start; j <= stop; j++) {
			loops++;

			// The main levenshtein comparison.
			// j can be equal to bLen here (stop is capped at bLen and the loop is
			// inclusive); bBufPtr has only bLen bytes, so that position is treated
			// as a mismatch instead of reading one byte past the buffer.
			cost = ((ut32)j < bLen && aBufPtr[i] == bBufPtr[j]) ? 0 : 1;
			smallest = R_MIN ((v1[j] + 1), (v0[j + 1] + 1));
			smallest = R_MIN (smallest, (v0[j] + cost));

			// populate the next two entries in v1.
			// only really required if this is the last loop.
			if (j + 2 > bLen + 3) {
				break;
			}
			v1[j + 1] = smallest;
			v1[j + 2] = smallest + 1;

			// If we have seen a smaller number, it's the new column Minimum
			colMin = R_MIN ((colMin), (smallest));

		}

		// We're going to start at i+1 next iteration
		// The column minimum is the current edit distance
		// This distance is the minimum 'search width' from the optimal 'i' diagonal
		// The extendStart picks up an edge case where we have a match on the first iteration
		// We update extendStart after we've set start for the next iteration.
		start = i + 1 - colMin - extendStart;

		// If the last processed entry is a match, AND
		// the current byte in 'a' and the previous processed entry in 'b' aren't a match
		// then we need to extend our search below the optimal 'i' diagonal. because we'll
		// have a vertical double up condition in our last two values of the results column.
		// j-2 is used because j++ increments prior to loop exit in the processing loop above.
		if (!cost && aBufPtr[i] != bBufPtr[j - 2]) {
			extendStop ++;
		}

		// If new start would be a match then we have a horizontal 'double up'
		// which means we need to keep an extra row of data
		// so don't increment the start counter this time, BUT keep
		// extendStart up our sleeves for next iteration.
		if (i + 1 < aLen && start < bLen && aBufPtr[i + 1] == bBufPtr[start]) {
			start --;
			extendStart ++;
		}
		// Switch v0 and v1 pointers via temp pointer
		temp = v0;
		v0 = v1;
		v1 = temp;

		// Print a processing update every 10K of outer loop
		if (verbose && i % 10000==0) {
			eprintf ("\rProcessing %d of %d\r", i, aLen);
		}
	}
	// Clean up output on loop exit (purely aesthetic)
	if (verbose) {
		eprintf ("\rProcessing %d of %d (loops=%llu)\n", i, aLen,loops);
	}
	{
		// the final distance is the last byte we processed in the inner loop.
		// v0 is used instead of v1 because we switched the pointers before exiting the outer loop
		const ut32 result = v0[stop];
		if (distance) {
			*distance = result;
		}
		// reported independently of `distance`, so callers may ask for either value alone
		if (similarity) {
			double diff = (double) (result) / (double) (R_MAX (aLen, bLen));
			*similarity = (double)1 - diff;
		}
	}
	free (v0);
	free (v1);
	return true;
}
|
2016-09-21 10:07:44 +00:00
|
|
|
|
|
|
|
/*
 * Levenshtein distance between two buffers using the full
 * (la + 1) x (lb + 1) dynamic-programming matrix.
 *
 * d          - unused
 * a, la      - first buffer and its length
 * b, lb      - second buffer and its length
 * distance   - optional out: edit distance
 * similarity - optional out: 1 - distance / max(la, lb)
 *
 * Returns false when a buffer is NULL or empty, when the matrix would need
 * 1 GB or more, or when an allocation fails; true otherwise.
 *
 * The memory estimate is now a single saturating 64-bit product. It counts
 * la + 1 row pointers, matching the allocation below (it used to count
 * lb + 1), and no longer depends on an `int` loop counter compared against
 * a ut32 bound.
 */
R_API bool r_diff_buffers_distance_original(RDiff *d, const ut8 *a, ut32 la, const ut8 *b, ut32 lb, ut32 *distance, double *similarity) {
	int i, j, tmin, **m;
	ut64 totalsz = 0;

	if (!a || !b || la < 1 || lb < 1) {
		return false;
	}

	// identical buffers need no matrix at all
	if (la == lb && !memcmp (a, b, la)) {
		if (distance != NULL) {
			*distance = 0;
		}
		if (similarity != NULL) {
			*similarity = 1.0;
		}
		return true;
	}

	{
		// one row pointer plus lb + 1 cells per row, la + 1 rows in total
		const ut64 max64 = ~(ut64)0;
		const ut64 rows = (ut64)la + 1;
		const ut64 rowsz = sizeof (int *) + ((ut64)lb + 1) * sizeof (int);
		// saturate instead of wrapping around for extreme lengths
		totalsz = (rows > max64 / rowsz)? max64: rows * rowsz;
	}
	if (totalsz >= 1024 * 1024 * 1024) { // 1 GB of ram
		char *szstr = r_num_units (NULL, totalsz);
		eprintf ("Too much memory required (%s) to run distance diff, Use -c.\n", szstr);
		free (szstr);
		return false;
	}
	// past this point (la + 1) * (lb + 1) * sizeof (int) is below 1 GB,
	// so la and lb fit comfortably in the int loop counters used below
	if ((m = malloc ((la+1) * sizeof(int*))) == NULL) {
		return false;
	}
	for (i = 0; i <= la; i++) {
		if ((m[i] = malloc ((lb+1) * sizeof(int))) == NULL) {
			eprintf ("Allocation failed\n");
			// release the rows allocated so far, then the table itself
			while (i--) {
				free (m[i]);
			}
			free (m);
			return false;
		}
	}

	// first column / first row: distance from the empty prefix
	for (i = 0; i <= la; i++) {
		m[i][0] = i;
	}
	for (j = 0; j <= lb; j++) {
		m[0][j] = j;
	}

	for (i = 1; i <= la; i++) {
		for (j = 1; j <= lb; j++) {
			int cost = (a[i-1] != b[j-1])? 1: 0;
			// cheapest of: step from the previous row, step from the previous column, diagonal step
			tmin = R_MIN (m[i-1][j] + 1, m[i][j-1] + 1);
			m[i][j] = R_MIN (tmin, m[i-1][j-1] + cost);
		}
	}

	if (distance) {
		*distance = m[la][lb];
	}
	if (similarity) {
		*similarity = (double)1 - (double)(m[la][lb])/(double)(R_MAX(la, lb));
	}

	for (i = 0; i <= la; i++) {
		free (m[i]);
	}
	free (m);

	return true;
}
|
|
|
|
|
|
|
|
/*
 * Compute the edit distance / similarity of two buffers.
 *
 * Uses the full-matrix implementation unless a context is supplied and its
 * `levenstein` flag is set, in which case the banded two-row implementation
 * is used. The result of the chosen routine is returned unchanged.
 */
R_API bool r_diff_buffers_distance(RDiff *d, const ut8 *a, ut32 la, const ut8 *b, ut32 lb, ut32 *distance, double *similarity) {
	if (!d || !d->levenstein) {
		return r_diff_buffers_distance_original (d, a, la, b, lb, distance, similarity);
	}
	return r_diff_buffers_distance_levenstein (d, a, la, b, lb, distance, similarity);
}
|