/* radare - LGPL - Copyright 2009-2022 - pancake, nikolai */ #include // the non-system-diff doesnt work well #define USE_SYSTEM_DIFF 1 R_API RDiff *r_diff_new_from(ut64 off_a, ut64 off_b) { RDiff *d = R_NEW0 (RDiff); if (d) { d->delta = 1; d->user = NULL; d->off_a = off_a; d->off_b = off_b; d->diff_cmd = strdup ("diff -u"); } return d; } R_API RDiff *r_diff_new(void) { return r_diff_new_from (0, 0); } R_API void r_diff_free(RDiff *d) { if (d) { free (d->diff_cmd); free (d); } } R_API int r_diff_set_callback(RDiff *d, RDiffCallback callback, void *user) { d->callback = callback; d->user = user; return 1; } R_API int r_diff_set_delta(RDiff *d, int delta) { d->delta = delta; return 1; } typedef struct levrow { ut32 *changes; ut32 start, end; } Levrow; static void lev_matrix_free(Levrow *matrix, ut32 len) { size_t i; for (i = 0; i < len; i++) { free (matrix[i].changes); } free (matrix); } static inline void lev_row_adjust(Levrow *row, ut32 maxdst, ut32 rownum, ut32 buflen, ut32 delta) { delta += rownum; ut64 end = (ut64)delta + maxdst; row->end = R_MIN (end, buflen); row->start = delta <= maxdst? 0: delta - maxdst; } static inline Levrow *lev_row_init(Levrow *matrix, ut32 maxdst, ut32 rownum, ut32 buflen, ut32 delta) { r_return_val_if_fail (matrix && !matrix[rownum].changes, false); Levrow *row = matrix + rownum; lev_row_adjust (row, maxdst, rownum, buflen, delta); if ((row->changes = R_NEWS (ut32, row->end - row->start + 1)) == NULL) { return NULL; } return row; } static inline ut32 lev_get_val(Levrow *row, ut32 i) { if (i >= row->start && i <= row->end) { return row->changes[i - row->start]; } return UT32_MAX - 1; // -1 so a +1 with sub weight does not overflow } // obtains array of operations, in reverse order, to get from column to row of // matrix static st32 lev_parse_matrix(Levrow *matrix, ut32 len, bool invert, RLevOp **chgs) { r_return_val_if_fail (len >= 2 && matrix && chgs && !*chgs, -1); Levrow *row = matrix + len - 1; Levrow *prev_row = row - 1; RLevOp a = LEVADD; RLevOp d = LEVDEL; if (invert) { a = LEVDEL; d = LEVADD; } const size_t overflow = (size_t)-1 / (2 * sizeof (RLevOp)); int j = row->end; size_t size = j; RLevOp *changes = R_NEWS (RLevOp, size); if (!changes) { return -1; } size_t insert = 0; while (row != matrix) { // matrix[0] is not processed ut32 sub = lev_get_val (prev_row, j - 1); ut32 del = lev_get_val (prev_row, j); ut32 add = lev_get_val (row, j - 1); if (insert >= size) { if (size >= overflow) { // overflow paranoia free (changes); return -1; } size *= 2; RLevOp *tmp = realloc (changes, size * sizeof (RLevOp)); if (!tmp) { free (changes); return -1; } changes = tmp; } if (sub <= del && sub <= add) { if (sub == lev_get_val (row, j)) { changes[insert++] = LEVNOP; } else { changes[insert++] = LEVSUB; } j--; } else if (del <= add && del <= sub) { changes[insert++] = d; } else { changes[insert++] = a; j--; continue; // continue with same rows } free (row->changes); row->changes = NULL; row = prev_row--; } if (size - insert < j) { if (size > overflow) { // overly paranoid free (changes); return -1; } size += j - (size - insert); RLevOp *tmp = realloc (changes, size * sizeof (RLevOp)); if (!tmp) { free (changes); return -1; } changes = tmp; } while (j > 0) { changes[insert++] = a; j--; } *chgs = changes; return insert; } static inline void lev_fill_changes(RLevOp *chgs, RLevOp op, ut32 count) { while (count > 0) { count--; chgs[count] = op; } } typedef struct { RDiff *d; char *str; } RDiffUser; #if USE_SYSTEM_DIFF R_API char *r_diff_buffers_tostring(RDiff *d, const ut8 *a, int la, const ut8 *b, int lb) { return r_diff_buffers_unified (d, a, la, b, lb); } #else // XXX buffers_static doesnt constructs the correct string in this callback static int tostring(RDiff *d, void *user, RDiffOp *op) { RDiffUser *u = (RDiffUser *)user; if (op->a_len > 0) { char *a_str = r_str_ndup ((const char *)op->a_buf + op->a_off, op->a_len); u->str = r_str_appendf (u->str, "+(%s)", a_str); #if 0 char *bufasm = r_str_prefix_all (a_str, "- "); u->str = r_str_appendf (u->str, "-(%s)", bufasm); free (bufasm); #endif free (a_str); } if (op->b_len > 0) { char *b_str = r_str_ndup ((const char *)op->b_buf + op->b_off, op->b_len); u->str = r_str_appendf (u->str, "+(%s)", b_str); #if 0 char *bufasm = r_str_prefix_all (b_str, "+ "); u->str = r_str_appendf (u->str, "+(%s)", bufasm); free (bufasm); #endif free (b_str); } if (op->a_len == op->b_len) { char *b_str = r_str_ndup ((const char *)op->a_buf + op->a_off, op->a_len); // char *bufasm = r_str_prefix_all (b_str, " "); u->str = r_str_appendf (u->str, "%s", b_str); // free (bufasm); free (b_str); } return 1; } R_API char *r_diff_buffers_tostring(RDiff *d, const ut8 *a, int la, const ut8 *b, int lb) { // XXX buffers_static doesnt constructs the correct string in this callback void *c = d->callback; void *u = d->user; RDiffUser du = {d, strdup ("")}; d->callback = &tostring; d->user = &du; r_diff_buffers_static (d, a, la, b, lb); d->callback = c; d->user = u; return du.str; } #endif #define diffHit() { \ const size_t i_hit = i - hit; \ int ra = la - i_hit; \ int rb = lb - i_hit; \ struct r_diff_op_t o = { \ .a_off = d->off_a+i-hit, .a_buf = a+i-hit, .a_len = R_MIN (hit, ra), \ .b_off = d->off_b+i-hit, .b_buf = b+i-hit, .b_len = R_MIN (hit, rb) \ }; \ d->callback (d, d->user, &o); \ } R_API int r_diff_buffers_static(RDiff *d, const ut8 *a, int la, const ut8 *b, int lb) { int i, len; int hit = 0; la = R_ABS (la); lb = R_ABS (lb); if (la != lb) { len = R_MIN (la, lb); R_LOG_INFO ("Buffer truncated to %d byte(s) (%d not compared)", len, R_ABS(lb - la)); } else { len = la; } for (i = 0; i < len; i++) { if (a[i] != b[i]) { hit++; } else { if (hit > 0) { diffHit (); hit = 0; } } } if (hit > 0) { diffHit (); } return 0; } R_API char *r_diff_buffers_unified(RDiff *d, const ut8 *a, int la, const ut8 *b, int lb) { char *fa = NULL; char *fb = NULL; int fd = r_file_mkstemp ("r_diff", &fa); int fe = r_file_mkstemp ("r_diff", &fb); if (fd == -1 || fe == -1) { R_LOG_ERROR ("Failed to create temporary files"); return NULL; } if (!fa || !fb) { R_LOG_ERROR ("fafb nul"); return NULL; } r_file_dump (fa, a, la, 0); r_file_dump (fb, b, lb, 0); #if 0 if (r_mem_is_printable (a, R_MIN (5, la))) { r_file_dump (fa, a, la, 0); r_file_dump (fb, b, lb, 0); } else { r_file_hexdump (fa, a, la, 0); r_file_hexdump (fb, b, lb, 0); } #endif char *err = NULL; char *out = NULL; int out_len; char *diff_cmdline = r_str_newf ("%s %s %s", d->diff_cmd, fa, fb); if (diff_cmdline) { (void)r_sys_cmd_str_full (diff_cmdline, NULL, 0, &out, &out_len, &err); free (diff_cmdline); } close (fd); close (fe); r_file_rm (fa); r_file_rm (fb); free (err); return out; } R_API int r_diff_buffers(RDiff *d, const ut8 *a, ut32 la, const ut8 *b, ut32 lb) { return d->delta ? r_diff_buffers_delta (d, a, la, b, lb) : r_diff_buffers_static (d, a, la, b, lb); } // Eugene W. Myers' O(ND) diff algorithm // Returns edit distance with costs: insertion=1, deletion=1, no substitution R_API bool r_diff_buffers_distance_myers(RDiff *diff, const ut8 *a, ut32 la, const ut8 *b, ut32 lb, ut32 *distance, double *similarity) { r_return_val_if_fail (a && b, false); const bool verbose = diff? diff->verbose: false; const ut32 length = la + lb; const ut8 *ea = a + la, *eb = b + lb; // Strip prefix for (; a < ea && b < eb && *a == *b; a++, b++) {} // Strip suffix for (; a < ea && b < eb && ea[-1] == eb[-1]; ea--, eb--) {} la = ea - a; lb = eb - b; ut32 *v0, *v; st64 m = (st64)la + lb, di = 0, low, high, i, x, y; if (m + 2 > SIZE_MAX / sizeof (st64) || !(v0 = malloc ((m + 2) * sizeof (ut32)))) { return false; } v = v0 + lb; v[1] = 0; for (di = 0; di <= m; di++) { low = -di + 2 * R_MAX (0, di - (st64)lb); high = di - 2 * R_MAX (0, di - (st64)la); for (i = low; i <= high; i += 2) { x = i == -di || (i != di && v[i-1] < v[i+1]) ? v[i+1] : v[i-1] + 1; y = x - i; while (x < la && y < lb && a[x] == b[y]) { x++; y++; } v[i] = x; if (x == la && y == lb) { goto out; } } if (verbose && di % 10000 == 0) { eprintf ("\rProcessing dist %" PFMT64d " of max %" PFMT64d "\r", (st64)di, (st64)m); } } out: if (verbose) { eprintf ("\n"); } free (v0); //Clean up output on loop exit (purely aesthetic) if (distance) { *distance = di; } if (similarity) { *similarity = length ? 1.0 - (double)di / length : 1.0; } return true; } R_API bool r_diff_buffers_distance_levenshtein(RDiff *diff, const ut8 *a, ut32 la, const ut8 *b, ut32 lb, ut32 *distance, double *similarity) { r_return_val_if_fail (a && b, false); const bool verbose = diff ? diff->verbose : false; const ut32 length = R_MAX (la, lb); const ut8 *ea = a + la, *eb = b + lb, *t; ut32 *d, i, j; // Strip prefix for (; a < ea && b < eb && *a == *b; a++, b++) {} // Strip suffix for (; a < ea && b < eb && ea[-1] == eb[-1]; ea--, eb--) {} la = ea - a; lb = eb - b; if (la < lb) { i = la; la = lb; lb = i; t = a; a = b; b = t; } if (sizeof (ut32) > SIZE_MAX / (lb + 1) || !(d = malloc ((lb + 1) * sizeof (ut32)))) { return false; } for (i = 0; i <= lb; i++) { d[i] = i; } for (i = 0; i < la; i++) { ut32 ul = d[0]; d[0] = i + 1; for (j = 0; j < lb; j++) { ut32 u = d[j + 1]; d[j + 1] = a[i] == b[j] ? ul : R_MIN (ul, R_MIN (d[j], u)) + 1; ul = u; } if (verbose && i % 10000 == 0) { eprintf ("\rProcessing %" PFMT32u " of %" PFMT32u "\r", i, la); } } if (verbose) { eprintf ("\n"); } if (distance) { *distance = d[lb]; } if (similarity) { *similarity = length ? 1.0 - (double)d[lb] / length : 1.0; } free (d); return true; } R_API bool r_diff_buffers_distance(RDiff *d, const ut8 *a, ut32 la, const ut8 *b, ut32 lb, ut32 *distance, double *similarity) { if (d) { switch (d->type) { case 'm': return r_diff_buffers_distance_myers (d, a, la, b, lb, distance, similarity); case 'l': default: break; } } return r_diff_buffers_distance_levenshtein (d, a, la, b, lb, distance, similarity); } // Use Needleman–Wunsch to diffchar. // This is an O(mn) algo in both space and time. // Note that 64KB * 64KB * 2 = 8GB. // TODO Discard common prefix and suffix R_API RDiffChar *r_diffchar_new(const ut8 *a, const ut8 *b) { r_return_val_if_fail (a && b, NULL); RDiffChar *diffchar = R_NEW0 (RDiffChar); if (!diffchar) { return NULL; } const size_t len_a = strlen ((const char *)a); const size_t len_b = strlen ((const char *)b); const size_t len_long = len_a > len_b ? len_a : len_b; const size_t dim = len_long + 1; char *dup_a = malloc (len_long); char *dup_b = malloc (len_long); st16 *align_table = malloc (dim * dim * sizeof (st16)); ut8 *align_a = malloc (2 * len_long); ut8 *align_b = malloc (2 * len_long); if (!(dup_a && dup_b && align_table && align_a && align_b)) { free (dup_a); free (dup_b); free (align_table); free (align_a); free (align_b); free (diffchar); return NULL; } snprintf (dup_a, len_long, "%s", a); a = (const ut8*)dup_a; snprintf (dup_b, len_long, "%s", b); b = (const ut8*)dup_b; // Fill table size_t row, col; *align_table = 0; for (row = 1; row < dim; row++) { // TODO Clamping [ST16_MIN + 1, .] *(align_table + row) = *(align_table + row * dim) = -(st16)row; } const st16 match = 1; const st16 match_nl = 2; const st16 mismatch = -2; const st16 gap = -1; for (row = 1; row < dim; row++) { for (col = 1; col < dim; col++) { // TODO Clamping [ST16_MIN + 1, ST16_MAX] const ut8 a_ch = a[col - 1]; const ut8 b_ch = b[row - 1]; const st16 tl_score = *(align_table + (row - 1) * dim + col - 1) + (a_ch == b_ch ? (a_ch == '\n' ? match_nl : match) : mismatch); const st16 t_score = *(align_table + (row - 1) * dim + col) + gap; const st16 l_score = *(align_table + row * dim + col - 1) + gap; st16 score; if (tl_score >= t_score && tl_score >= l_score) { score = tl_score; } else if (t_score >= tl_score && t_score >= l_score) { score = t_score; } else { score = l_score; } *(align_table + row * dim + col) = score; } } #if 0 // Print table (Debug) char char_str[3] = { ' ' }; printf ("%4s ", char_str); for (col = 0; col < dim; col++) { if (col && a[col - 1] == '\n') { char_str[0] = '\\'; char_str[1] = 'n'; } else { char_str[0] = col ? a[col - 1] : ' '; char_str[1] = 0; } printf ("%4s ", char_str); } printf ("\n"); for (row = 0; row < dim; row++) { if (row && b[row - 1] == '\n') { char_str[0] = '\\'; char_str[1] = 'n'; } else { char_str[0] = row ? b[row - 1] : ' '; char_str[1] = 0; } printf ("%4s ", char_str); for (col = 0; col < dim; col++) { printf ("%4d ", *(align_table + row * dim + col)); } printf ("\n"); } #endif // Do alignment size_t idx_a = len_long - 1; size_t idx_b = len_long - 1; size_t idx_align = 2 * len_long - 1; size_t pos_row = dim - 1; size_t pos_col = dim - 1; while (pos_row || pos_col) { const st16 tl_score = (pos_row > 0 && pos_col > 0) ? *(align_table + (pos_row - 1) * dim + pos_col - 1) : ST16_MIN; const st16 t_score = pos_row > 0 ? *(align_table + (pos_row - 1) * dim + pos_col) : ST16_MIN; const st16 l_score = pos_col > 0 ? *(align_table + pos_row * dim + pos_col - 1) : ST16_MIN; const bool match = a[idx_a] == b[idx_b]; if (t_score >= l_score && (!match || t_score >= tl_score)) { align_a[idx_align] = 0; align_b[idx_align] = b[idx_b--]; idx_align--; pos_row--; } else if (l_score >= t_score && (!match || l_score >= tl_score)) { align_a[idx_align] = a[idx_a--]; align_b[idx_align] = 0; idx_align--; pos_col--; } else { align_a[idx_align] = a[idx_a--]; align_b[idx_align] = b[idx_b--]; idx_align--; pos_row--; pos_col--; } } idx_align++; const size_t start_align = idx_align; #if 0 // Print alignment (Debug) for (; idx_align < 2 * len_long; idx_align++) { const ut8 ch = align_a[idx_align]; if (align_b[idx_align] == '\n' && ch != '\n') { printf (ch ? " " : "-"); } if (ch == 0) { printf ("-"); } else if (ch == '\n') { printf ("\\n"); } else { printf ("%c", ch); } } printf ("\n"); for (idx_align = start_align; idx_align < 2 * len_long; idx_align++) { const ut8 ch = align_b[idx_align]; if (align_a[idx_align] == '\n' && ch != '\n') { printf (ch ? " " : "-"); } if (ch == 0) { printf ("-"); } else if (ch == '\n') { printf ("\\n"); } else { printf ("%c", ch); } } printf ("\n"); #endif diffchar->align_a = align_a; diffchar->align_b = align_b; diffchar->len_buf = len_long; diffchar->start_align = start_align; free (dup_a); free (dup_b); free (align_table); return diffchar; } typedef enum { R2R_ALIGN_MATCH, R2R_ALIGN_MISMATCH, R2R_ALIGN_TOP_GAP, R2R_ALIGN_BOTTOM_GAP } R2RCharAlignment; typedef enum { R2R_DIFF_MATCH, R2R_DIFF_DELETE, R2R_DIFF_INSERT } R2RPrintDiffMode; R_API void r_diffchar_print(RDiffChar *diffchar) { r_return_if_fail (diffchar); R2RPrintDiffMode cur_mode = R2R_DIFF_MATCH; R2RCharAlignment cur_align; size_t idx_align = diffchar->start_align; while (idx_align < 2 * diffchar->len_buf) { const ut8 a_ch = diffchar->align_a[idx_align]; const ut8 b_ch = diffchar->align_b[idx_align]; if (a_ch && !b_ch) { cur_align = R2R_ALIGN_BOTTOM_GAP; } else if (!a_ch && b_ch) { cur_align = R2R_ALIGN_TOP_GAP; } else if (a_ch != b_ch) { R_LOG_ERROR ("Internal mismatch detected!"); cur_align = R2R_ALIGN_MISMATCH; } else { cur_align = R2R_ALIGN_MATCH; } if (cur_mode == R2R_DIFF_MATCH) { if (cur_align == R2R_ALIGN_MATCH) { if (a_ch) { printf ("%c", a_ch); } } else if (cur_align == R2R_ALIGN_BOTTOM_GAP) { printf (a_ch == '\n' ? "%c" Color_HLDELETE : Color_HLDELETE "%c", a_ch); cur_mode = R2R_DIFF_DELETE; } else if (cur_align == R2R_ALIGN_TOP_GAP) { printf (b_ch == '\n' ? "%c" Color_HLINSERT : Color_HLINSERT "%c", b_ch); cur_mode = R2R_DIFF_INSERT; } } else if (cur_mode == R2R_DIFF_DELETE) { if (cur_align == R2R_ALIGN_MATCH) { printf (Color_RESET); if (a_ch) { printf ("%c", a_ch); } cur_mode = R2R_DIFF_MATCH; } else if (cur_align == R2R_ALIGN_BOTTOM_GAP) { printf (a_ch == '\n' ? Color_RESET "%c" Color_HLDELETE : "%c", a_ch); } else if (cur_align == R2R_ALIGN_TOP_GAP) { printf (b_ch == '\n' ? Color_RESET "%c" Color_HLINSERT : Color_HLINSERT "%c", b_ch); cur_mode = R2R_DIFF_INSERT; } } else if (cur_mode == R2R_DIFF_INSERT) { if (cur_align == R2R_ALIGN_MATCH) { printf (Color_RESET); if (a_ch) { printf ("%c", a_ch); } cur_mode = R2R_DIFF_MATCH; } else if (cur_align == R2R_ALIGN_BOTTOM_GAP) { printf (a_ch == '\n' ? Color_RESET "%c" Color_HLDELETE : Color_HLDELETE "%c", a_ch); cur_mode = R2R_DIFF_DELETE; } else if (cur_align == R2R_ALIGN_TOP_GAP) { printf (b_ch == '\n' ? Color_RESET "%c" Color_HLINSERT : "%c", b_ch); } } idx_align++; } printf (Color_RESET "\n"); } R_API void r_diffchar_free(RDiffChar *diffchar) { if (diffchar) { free ((ut8 *)diffchar->align_a); free ((ut8 *)diffchar->align_b); free (diffchar); } } static st32 r_diff_levenshtein_nopath(RLevBuf *bufa, RLevBuf *bufb, ut32 maxdst, RLevMatches levdiff, size_t skip, ut32 alen, ut32 blen) { r_return_val_if_fail (bufa && bufb && bufa->buf && bufb->buf, -1); r_return_val_if_fail (blen >= alen && alen > 0, -1); // max distance is at most length of longer input, or provided by user ut32 origdst = maxdst = R_MIN (maxdst, blen); // two rows Levrow *matrix = R_NEWS0 (Levrow, 2); if (!matrix) { return -1; } Levrow *row = matrix; Levrow *prev_row = matrix + 1; // must allocate for largest row, not the first row, so don't use // lev_row_init row->changes = R_NEWS (ut32, 2 * maxdst + 1); prev_row->changes = R_NEWS (ut32, 2 * maxdst + 1); if (!prev_row->changes || !row->changes) { lev_matrix_free (matrix, alen + 1); return -1; } ut32 ldelta = blen - alen; if (ldelta > maxdst) { lev_matrix_free (matrix, alen + 1); return ST32_MAX; } lev_row_adjust (row, maxdst, 0, blen, ldelta); size_t i; for (i = row->start; i <= row->end; i++) { row->changes[i] = i; } // do the rest of the rows ut32 oldmin = 0; // minimum cell in row 0 for (i = 1; i <= alen; i++) { // loop through all rows // switch rows if (row == matrix) { row = prev_row; prev_row = matrix; } else { prev_row = row; row = matrix; } lev_row_adjust (row, maxdst, i, blen, ldelta); ut32 start = row->start; ut32 udel = UT32_MAX; if (start == 0) { row->changes[0] = udel = i; start++; } ut32 newmin = UT32_MAX; ut32 sub = lev_get_val (prev_row, start - 1); ut32 j; for (j = start; j <= row->end; j++) { ut32 add = lev_get_val (prev_row, j); ut32 ans = R_MIN (udel, add) + 1; if (ans >= sub) { // on rare occassions, when add/del is obviously better then // sub, we can skip levdiff call int d = levdiff (bufa, bufb, i + skip - 1, j + skip - 1)? 1: 0; ans = R_MIN (ans, sub + d); } sub = add; udel = ans; row->changes[j - row->start] = ans; if (ans < newmin) { newmin = ans; } } if (newmin > oldmin) { if (maxdst == 0) { // provided bad maxdst lev_matrix_free (matrix, 2); return ST32_MAX; } // if smallest element of this row is larger then the smallest // element of previous row a change must occur and thus the // distance for the rest of the alg can be reduced. oldmin = newmin; maxdst--; } } st32 ret = lev_get_val (row, row->end); if (ret > origdst) { ret = ST32_MAX; } lev_matrix_free (matrix, 2); return ret; } /** * \brief Return Levenshtein distance and put array of changes, of unknown * lenght, in chgs * \param bufa Structure to represent starting buffer * \param bufb Structure to represent the buffer to reach * \param maxdst Max Levenshtein distance need, send UT32_MAX if unknown. * \param levdiff Function pointer returning true when there is a difference. * \param chgs Returned array of changes to get from bufa to bufb * * Perform a Levenshtein diff on two buffers and obtain a RLevOp array of * changes. The length of the RLevOp array is NOT provided, it is terminated by * the LEVEND value. Providing a good maxdst value will increase performance of * this algorithm. If computed maxdst is exceeded ST32_MAX will be returned and * chgs will be left NULL. The chgs value must point to a NULL pointer. The * caller must free *chgs. */ R_API st32 r_diff_levenshtein_path(RLevBuf *bufa, RLevBuf *bufb, ut32 maxdst, RLevMatches levdiff, RLevOp **chgs) { r_return_val_if_fail (bufa && bufb && bufa->buf && bufb->buf, -1); r_return_val_if_fail (!chgs || !*chgs, -1); // if chgs then it must point at NULL // force buffer b to be longer, this will invert add/del resulsts bool invert = false; if (bufb->len < bufa->len) { invert = true; RLevBuf *x = bufa; bufa = bufb; bufb = x; } r_return_val_if_fail (bufb->len < UT32_MAX, -1); ut32 ldelta = bufb->len - bufa->len; if (ldelta > maxdst) { return ST32_MAX; } // Strip start as long as bytes don't diff size_t skip; ut32 alen = bufa->len; ut32 blen = bufb->len; for (skip = 0; skip < alen && !levdiff (bufa, bufb, skip, skip); skip++) { } // strip suffix as long as bytes don't diff size_t i; for (i = 0; alen > skip && !levdiff (bufa, bufb, alen - 1, blen - 1); alen--, blen--, i++) {} alen -= skip; blen -= skip; if (alen == 0) { if (chgs) { RLevOp *c = R_NEWS (RLevOp, skip + i + blen + 1); if (!c) { return -1; } *chgs = c; lev_fill_changes (c, LEVNOP, skip); c += skip; lev_fill_changes (c, invert? LEVDEL: LEVADD, blen); c += blen; lev_fill_changes (c, LEVNOP, i); c += i; *c = LEVEND; } return blen; } if (!chgs) { return r_diff_levenshtein_nopath (bufa, bufb, maxdst, levdiff, skip, alen, blen); } // max distance is at most length of longer input, or provided by user ut32 origdst = maxdst = R_MIN (maxdst, blen); // alloc array of rows Levrow *matrix = R_NEWS0 (Levrow, alen + 1); if (!matrix) { return -1; } // init row 0 Levrow *row = lev_row_init (matrix, maxdst, 0, blen, ldelta); if (!row) { lev_matrix_free (matrix, alen + 1); return -1; } for (i = row->start; i <= row->end; i++) { row->changes[i] = i; } // do the rest of the rows ut32 oldmin = 0; // minimum cell in row 0 Levrow *prev_row; for (i = 1; i <= alen; i++) { // loop through all rows prev_row = row; if ((row = lev_row_init (matrix, maxdst, i, blen, ldelta)) == NULL) { lev_matrix_free (matrix, alen + 1); return -1; } ut32 start = row->start; ut32 udel = UT32_MAX; if (start == 0) { row->changes[0] = udel = i; start++; } ut32 newmin = UT32_MAX; ut32 sub = lev_get_val (prev_row, start - 1); ut32 j; for (j = start; j <= row->end; j++) { ut32 add = lev_get_val (prev_row, j); ut32 ans = R_MIN (udel, add) + 1; if (ans >= sub) { // on rare occassions, when add/del is obviously better then // sub, we can skip levdiff call int d = levdiff (bufa, bufb, i + skip - 1, j + skip - 1)? 1: 0; ans = R_MIN (ans, sub + d); } sub = add; udel = ans; row->changes[j - row->start] = ans; if (ans < newmin) { newmin = ans; } } if (newmin > oldmin) { if (maxdst == 0) { // provided bad maxdst lev_matrix_free (matrix, alen + 1); return ST32_MAX; } // if smallest element of this row is larger then the smallest // element of previous row a change must occur and thus the // distance for the rest of the alg can be reduced. oldmin = newmin; maxdst--; } } st32 ret = lev_get_val (row, row->end); if (ret > origdst) { // can happen when off by one lev_matrix_free (matrix, alen + 1); return ST32_MAX; } #if 0 { // for debugging matrix size_t total = 0; for (i = 0; i <= alen; i++) { Levrow *bow = matrix + i; ut32 j; printf (" "); for (j = 0; j <= blen; j++) { ut32 val = lev_get_val (bow, j); if (val >= UT32_MAX - 1) { printf (" .."); } else { printf (" %02x", val); } } total += bow->end + 1 - bow->start; printf (" buflen: %d\n", bow->end + 1 - bow->start); } printf ("\n%ld matrix cells allocated\n", total); } #endif RLevOp *mtxpath = NULL; st32 chg_size = lev_parse_matrix (matrix, alen + 1, invert, &mtxpath); lev_matrix_free (matrix, alen + 1); if (chg_size > 0 && mtxpath) { ut32 tail = bufb->len - skip - blen; RLevOp *c = R_NEWS (RLevOp, skip + chg_size + tail + 1); *chgs = c; if (c) { lev_fill_changes (c, LEVNOP, skip); c += skip; while (chg_size > 0) { chg_size--; *c = mtxpath[chg_size]; c++; } lev_fill_changes (c, LEVNOP, tail); c += tail; *c = LEVEND; } } free (mtxpath); return ret; }