FFmpeg/libavutil/lls.c

124 lines
3.2 KiB
C
Raw Normal View History

/*
* linear least squares model
*
* Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/**
* @file
* linear least squares model
*/
#include <math.h>
#include <string.h>
#include "attributes.h"
#include "internal.h"
#include "version.h"
#include "lls.h"
static void update_lls(LLSModel *m, const double *var)
{
int i, j;
for (i = 0; i <= m->indep_count; i++) {
for (j = i; j <= m->indep_count; j++) {
m->covariance[i][j] += var[i] * var[j];
}
}
}
void avpriv_solve_lls(LLSModel *m, double threshold, unsigned short min_order)
{
int i, j, k;
double (*factor)[MAX_VARS_ALIGN] = (void *) &m->covariance[1][0];
double (*covar) [MAX_VARS_ALIGN] = (void *) &m->covariance[1][1];
double *covar_y = m->covariance[0];
int count = m->indep_count;
for (i = 0; i < count; i++) {
for (j = i; j < count; j++) {
double sum = covar[i][j];
avutil/lls: speed up performance of solve_lls This is a trivial rewrite of the loops that results in better prefetching and associated cache efficiency. Essentially, the problem is that modern prefetching logic is based on finite state Markov memory, a reasonable assumption that is used elsewhere in CPU's in for instance branch predictors. Surrounding loops all iterate forward through the array, making the predictor think of prefetching in the forward direction, but the intermediate loop is unnecessarily in the backward direction. Speedup is nontrivial. Benchmarks obtained by 10^6 iterations within solve_lls, with START/STOP_TIMER. File is tests/data/fate/flac-16-lpc-cholesky.err. Hardware: x86-64, Haswell, GNU/Linux. new: 17291 decicycles in solve_lls, 2096706 runs, 446 skips 17255 decicycles in solve_lls, 4193657 runs, 647 skips 17231 decicycles in solve_lls, 8384997 runs, 3611 skips 17189 decicycles in solve_lls,16771010 runs, 6206 skips 17132 decicycles in solve_lls,33544757 runs, 9675 skips 17092 decicycles in solve_lls,67092404 runs, 16460 skips 17058 decicycles in solve_lls,134188213 runs, 29515 skips old: 18009 decicycles in solve_lls, 2096665 runs, 487 skips 17805 decicycles in solve_lls, 4193320 runs, 984 skips 17779 decicycles in solve_lls, 8386855 runs, 1753 skips 18289 decicycles in solve_lls,16774280 runs, 2936 skips 18158 decicycles in solve_lls,33548104 runs, 6328 skips 18420 decicycles in solve_lls,67091793 runs, 17071 skips 18310 decicycles in solve_lls,134187219 runs, 30509 skips Reviewed-by: Michael Niedermayer <michael@niedermayer.cc> Signed-off-by: Ganesh Ajjanagadde <gajjanagadde@gmail.com>
2015-11-24 00:05:54 +00:00
for (k = 0; k <= i-1; k++)
sum -= factor[i][k] * factor[j][k];
if (i == j) {
if (sum < threshold)
sum = 1.0;
factor[i][i] = sqrt(sum);
} else {
factor[j][i] = sum / factor[i][i];
}
}
}
for (i = 0; i < count; i++) {
double sum = covar_y[i + 1];
avutil/lls: speed up performance of solve_lls This is a trivial rewrite of the loops that results in better prefetching and associated cache efficiency. Essentially, the problem is that modern prefetching logic is based on finite state Markov memory, a reasonable assumption that is used elsewhere in CPU's in for instance branch predictors. Surrounding loops all iterate forward through the array, making the predictor think of prefetching in the forward direction, but the intermediate loop is unnecessarily in the backward direction. Speedup is nontrivial. Benchmarks obtained by 10^6 iterations within solve_lls, with START/STOP_TIMER. File is tests/data/fate/flac-16-lpc-cholesky.err. Hardware: x86-64, Haswell, GNU/Linux. new: 17291 decicycles in solve_lls, 2096706 runs, 446 skips 17255 decicycles in solve_lls, 4193657 runs, 647 skips 17231 decicycles in solve_lls, 8384997 runs, 3611 skips 17189 decicycles in solve_lls,16771010 runs, 6206 skips 17132 decicycles in solve_lls,33544757 runs, 9675 skips 17092 decicycles in solve_lls,67092404 runs, 16460 skips 17058 decicycles in solve_lls,134188213 runs, 29515 skips old: 18009 decicycles in solve_lls, 2096665 runs, 487 skips 17805 decicycles in solve_lls, 4193320 runs, 984 skips 17779 decicycles in solve_lls, 8386855 runs, 1753 skips 18289 decicycles in solve_lls,16774280 runs, 2936 skips 18158 decicycles in solve_lls,33548104 runs, 6328 skips 18420 decicycles in solve_lls,67091793 runs, 17071 skips 18310 decicycles in solve_lls,134187219 runs, 30509 skips Reviewed-by: Michael Niedermayer <michael@niedermayer.cc> Signed-off-by: Ganesh Ajjanagadde <gajjanagadde@gmail.com>
2015-11-24 00:05:54 +00:00
for (k = 0; k <= i-1; k++)
sum -= factor[i][k] * m->coeff[0][k];
m->coeff[0][i] = sum / factor[i][i];
}
for (j = count - 1; j >= min_order; j--) {
for (i = j; i >= 0; i--) {
double sum = m->coeff[0][i];
for (k = i + 1; k <= j; k++)
sum -= factor[k][i] * m->coeff[j][k];
m->coeff[j][i] = sum / factor[i][i];
}
m->variance[j] = covar_y[0];
for (i = 0; i <= j; i++) {
double sum = m->coeff[j][i] * covar[i][i] - 2 * covar_y[i + 1];
for (k = 0; k < i; k++)
sum += 2 * m->coeff[j][k] * covar[k][i];
m->variance[j] += m->coeff[j][i] * sum;
}
}
}
static double evaluate_lls(LLSModel *m, const double *param, int order)
{
int i;
double out = 0;
for (i = 0; i <= order; i++)
out += param[i] * m->coeff[order][i];
return out;
}
av_cold void avpriv_init_lls(LLSModel *m, int indep_count)
{
memset(m, 0, sizeof(LLSModel));
m->indep_count = indep_count;
m->update_lls = update_lls;
m->evaluate_lls = evaluate_lls;
if (ARCH_X86)
ff_init_lls_x86(m);
}