Bug 1511670 - Adding required files for encoding webp. r=aosmond

Differential Revision: https://phabricator.services.mozilla.com/D130547
Ulrich-Matthias Schäfer 2021-11-06 13:09:06 +00:00
parent 890026f19b
commit 2376855c25
51 changed files with 25825 additions and 62 deletions


@@ -10,6 +10,8 @@ with Files('**'):
EXPORTS.webp += [
    'src/webp/decode.h',
    'src/webp/demux.h',
    'src/webp/encode.h',
    'src/webp/format_constants.h',
    'src/webp/mux_types.h',
    'src/webp/types.h',
]
@@ -18,6 +20,7 @@ DIRS += [
    'src/dec',
    'src/demux',
    'src/dsp',
    'src/enc',
    'src/moz',
    'src/utils',
]


@@ -0,0 +1,411 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Author: Skal (pascal.massimino@gmail.com)
#include "src/dsp/dsp.h"
#include "src/enc/cost_enc.h"
//------------------------------------------------------------------------------
// Boolean-cost cost table
const uint16_t VP8EntropyCost[256] = {
1792, 1792, 1792, 1536, 1536, 1408, 1366, 1280, 1280, 1216,
1178, 1152, 1110, 1076, 1061, 1024, 1024, 992, 968, 951,
939, 911, 896, 878, 871, 854, 838, 820, 811, 794,
786, 768, 768, 752, 740, 732, 720, 709, 704, 690,
683, 672, 666, 655, 647, 640, 631, 622, 615, 607,
598, 592, 586, 576, 572, 564, 559, 555, 547, 541,
534, 528, 522, 512, 512, 504, 500, 494, 488, 483,
477, 473, 467, 461, 458, 452, 448, 443, 438, 434,
427, 424, 419, 415, 410, 406, 403, 399, 394, 390,
384, 384, 377, 374, 370, 366, 362, 359, 355, 351,
347, 342, 342, 336, 333, 330, 326, 323, 320, 316,
312, 308, 305, 302, 299, 296, 293, 288, 287, 283,
280, 277, 274, 272, 268, 266, 262, 256, 256, 256,
251, 248, 245, 242, 240, 237, 234, 232, 228, 226,
223, 221, 218, 216, 214, 211, 208, 205, 203, 201,
198, 196, 192, 191, 188, 187, 183, 181, 179, 176,
175, 171, 171, 168, 165, 163, 160, 159, 156, 154,
152, 150, 148, 146, 144, 142, 139, 138, 135, 133,
131, 128, 128, 125, 123, 121, 119, 117, 115, 113,
111, 110, 107, 105, 103, 102, 100, 98, 96, 94,
92, 91, 89, 86, 86, 83, 82, 80, 77, 76,
74, 73, 71, 69, 67, 66, 64, 63, 61, 59,
57, 55, 54, 52, 51, 49, 47, 46, 44, 43,
41, 40, 38, 36, 35, 33, 32, 30, 29, 27,
25, 24, 22, 21, 19, 18, 16, 15, 13, 12,
10, 9, 7, 6, 4, 3
};
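// How this table is consumed (an illustration added for context, not part of
// the upstream file): entries are bit costs in 1/256-bit fixed point,
// approximately -log2(p / 256) * 256 and clamped at both ends, so index 128
// costs exactly one bit (256). cost_enc.h pairs it with a helper of roughly
// this shape (paraphrased from memory; treat the exact form as an assumption):
//   static WEBP_INLINE int VP8BitCost(int bit, uint8_t proba) {
//     return !bit ? VP8EntropyCost[proba] : VP8EntropyCost[255 - proba];
//   }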
//------------------------------------------------------------------------------
// Level cost tables
// fixed costs for coding levels, deduced from the coding tree.
// This is only the part that doesn't depend on the probability state.
const uint16_t VP8LevelFixedCosts[MAX_LEVEL + 1] = {
0, 256, 256, 256, 256, 432, 618, 630,
731, 640, 640, 828, 901, 948, 1021, 1101,
1174, 1221, 1294, 1042, 1085, 1115, 1158, 1202,
1245, 1275, 1318, 1337, 1380, 1410, 1453, 1497,
1540, 1570, 1613, 1280, 1295, 1317, 1332, 1358,
1373, 1395, 1410, 1454, 1469, 1491, 1506, 1532,
1547, 1569, 1584, 1601, 1616, 1638, 1653, 1679,
1694, 1716, 1731, 1775, 1790, 1812, 1827, 1853,
1868, 1890, 1905, 1727, 1733, 1742, 1748, 1759,
1765, 1774, 1780, 1800, 1806, 1815, 1821, 1832,
1838, 1847, 1853, 1878, 1884, 1893, 1899, 1910,
1916, 1925, 1931, 1951, 1957, 1966, 1972, 1983,
1989, 1998, 2004, 2027, 2033, 2042, 2048, 2059,
2065, 2074, 2080, 2100, 2106, 2115, 2121, 2132,
2138, 2147, 2153, 2178, 2184, 2193, 2199, 2210,
2216, 2225, 2231, 2251, 2257, 2266, 2272, 2283,
2289, 2298, 2304, 2168, 2174, 2183, 2189, 2200,
2206, 2215, 2221, 2241, 2247, 2256, 2262, 2273,
2279, 2288, 2294, 2319, 2325, 2334, 2340, 2351,
2357, 2366, 2372, 2392, 2398, 2407, 2413, 2424,
2430, 2439, 2445, 2468, 2474, 2483, 2489, 2500,
2506, 2515, 2521, 2541, 2547, 2556, 2562, 2573,
2579, 2588, 2594, 2619, 2625, 2634, 2640, 2651,
2657, 2666, 2672, 2692, 2698, 2707, 2713, 2724,
2730, 2739, 2745, 2540, 2546, 2555, 2561, 2572,
2578, 2587, 2593, 2613, 2619, 2628, 2634, 2645,
2651, 2660, 2666, 2691, 2697, 2706, 2712, 2723,
2729, 2738, 2744, 2764, 2770, 2779, 2785, 2796,
2802, 2811, 2817, 2840, 2846, 2855, 2861, 2872,
2878, 2887, 2893, 2913, 2919, 2928, 2934, 2945,
2951, 2960, 2966, 2991, 2997, 3006, 3012, 3023,
3029, 3038, 3044, 3064, 3070, 3079, 3085, 3096,
3102, 3111, 3117, 2981, 2987, 2996, 3002, 3013,
3019, 3028, 3034, 3054, 3060, 3069, 3075, 3086,
3092, 3101, 3107, 3132, 3138, 3147, 3153, 3164,
3170, 3179, 3185, 3205, 3211, 3220, 3226, 3237,
3243, 3252, 3258, 3281, 3287, 3296, 3302, 3313,
3319, 3328, 3334, 3354, 3360, 3369, 3375, 3386,
3392, 3401, 3407, 3432, 3438, 3447, 3453, 3464,
3470, 3479, 3485, 3505, 3511, 3520, 3526, 3537,
3543, 3552, 3558, 2816, 2822, 2831, 2837, 2848,
2854, 2863, 2869, 2889, 2895, 2904, 2910, 2921,
2927, 2936, 2942, 2967, 2973, 2982, 2988, 2999,
3005, 3014, 3020, 3040, 3046, 3055, 3061, 3072,
3078, 3087, 3093, 3116, 3122, 3131, 3137, 3148,
3154, 3163, 3169, 3189, 3195, 3204, 3210, 3221,
3227, 3236, 3242, 3267, 3273, 3282, 3288, 3299,
3305, 3314, 3320, 3340, 3346, 3355, 3361, 3372,
3378, 3387, 3393, 3257, 3263, 3272, 3278, 3289,
3295, 3304, 3310, 3330, 3336, 3345, 3351, 3362,
3368, 3377, 3383, 3408, 3414, 3423, 3429, 3440,
3446, 3455, 3461, 3481, 3487, 3496, 3502, 3513,
3519, 3528, 3534, 3557, 3563, 3572, 3578, 3589,
3595, 3604, 3610, 3630, 3636, 3645, 3651, 3662,
3668, 3677, 3683, 3708, 3714, 3723, 3729, 3740,
3746, 3755, 3761, 3781, 3787, 3796, 3802, 3813,
3819, 3828, 3834, 3629, 3635, 3644, 3650, 3661,
3667, 3676, 3682, 3702, 3708, 3717, 3723, 3734,
3740, 3749, 3755, 3780, 3786, 3795, 3801, 3812,
3818, 3827, 3833, 3853, 3859, 3868, 3874, 3885,
3891, 3900, 3906, 3929, 3935, 3944, 3950, 3961,
3967, 3976, 3982, 4002, 4008, 4017, 4023, 4034,
4040, 4049, 4055, 4080, 4086, 4095, 4101, 4112,
4118, 4127, 4133, 4153, 4159, 4168, 4174, 4185,
4191, 4200, 4206, 4070, 4076, 4085, 4091, 4102,
4108, 4117, 4123, 4143, 4149, 4158, 4164, 4175,
4181, 4190, 4196, 4221, 4227, 4236, 4242, 4253,
4259, 4268, 4274, 4294, 4300, 4309, 4315, 4326,
4332, 4341, 4347, 4370, 4376, 4385, 4391, 4402,
4408, 4417, 4423, 4443, 4449, 4458, 4464, 4475,
4481, 4490, 4496, 4521, 4527, 4536, 4542, 4553,
4559, 4568, 4574, 4594, 4600, 4609, 4615, 4626,
4632, 4641, 4647, 3515, 3521, 3530, 3536, 3547,
3553, 3562, 3568, 3588, 3594, 3603, 3609, 3620,
3626, 3635, 3641, 3666, 3672, 3681, 3687, 3698,
3704, 3713, 3719, 3739, 3745, 3754, 3760, 3771,
3777, 3786, 3792, 3815, 3821, 3830, 3836, 3847,
3853, 3862, 3868, 3888, 3894, 3903, 3909, 3920,
3926, 3935, 3941, 3966, 3972, 3981, 3987, 3998,
4004, 4013, 4019, 4039, 4045, 4054, 4060, 4071,
4077, 4086, 4092, 3956, 3962, 3971, 3977, 3988,
3994, 4003, 4009, 4029, 4035, 4044, 4050, 4061,
4067, 4076, 4082, 4107, 4113, 4122, 4128, 4139,
4145, 4154, 4160, 4180, 4186, 4195, 4201, 4212,
4218, 4227, 4233, 4256, 4262, 4271, 4277, 4288,
4294, 4303, 4309, 4329, 4335, 4344, 4350, 4361,
4367, 4376, 4382, 4407, 4413, 4422, 4428, 4439,
4445, 4454, 4460, 4480, 4486, 4495, 4501, 4512,
4518, 4527, 4533, 4328, 4334, 4343, 4349, 4360,
4366, 4375, 4381, 4401, 4407, 4416, 4422, 4433,
4439, 4448, 4454, 4479, 4485, 4494, 4500, 4511,
4517, 4526, 4532, 4552, 4558, 4567, 4573, 4584,
4590, 4599, 4605, 4628, 4634, 4643, 4649, 4660,
4666, 4675, 4681, 4701, 4707, 4716, 4722, 4733,
4739, 4748, 4754, 4779, 4785, 4794, 4800, 4811,
4817, 4826, 4832, 4852, 4858, 4867, 4873, 4884,
4890, 4899, 4905, 4769, 4775, 4784, 4790, 4801,
4807, 4816, 4822, 4842, 4848, 4857, 4863, 4874,
4880, 4889, 4895, 4920, 4926, 4935, 4941, 4952,
4958, 4967, 4973, 4993, 4999, 5008, 5014, 5025,
5031, 5040, 5046, 5069, 5075, 5084, 5090, 5101,
5107, 5116, 5122, 5142, 5148, 5157, 5163, 5174,
5180, 5189, 5195, 5220, 5226, 5235, 5241, 5252,
5258, 5267, 5273, 5293, 5299, 5308, 5314, 5325,
5331, 5340, 5346, 4604, 4610, 4619, 4625, 4636,
4642, 4651, 4657, 4677, 4683, 4692, 4698, 4709,
4715, 4724, 4730, 4755, 4761, 4770, 4776, 4787,
4793, 4802, 4808, 4828, 4834, 4843, 4849, 4860,
4866, 4875, 4881, 4904, 4910, 4919, 4925, 4936,
4942, 4951, 4957, 4977, 4983, 4992, 4998, 5009,
5015, 5024, 5030, 5055, 5061, 5070, 5076, 5087,
5093, 5102, 5108, 5128, 5134, 5143, 5149, 5160,
5166, 5175, 5181, 5045, 5051, 5060, 5066, 5077,
5083, 5092, 5098, 5118, 5124, 5133, 5139, 5150,
5156, 5165, 5171, 5196, 5202, 5211, 5217, 5228,
5234, 5243, 5249, 5269, 5275, 5284, 5290, 5301,
5307, 5316, 5322, 5345, 5351, 5360, 5366, 5377,
5383, 5392, 5398, 5418, 5424, 5433, 5439, 5450,
5456, 5465, 5471, 5496, 5502, 5511, 5517, 5528,
5534, 5543, 5549, 5569, 5575, 5584, 5590, 5601,
5607, 5616, 5622, 5417, 5423, 5432, 5438, 5449,
5455, 5464, 5470, 5490, 5496, 5505, 5511, 5522,
5528, 5537, 5543, 5568, 5574, 5583, 5589, 5600,
5606, 5615, 5621, 5641, 5647, 5656, 5662, 5673,
5679, 5688, 5694, 5717, 5723, 5732, 5738, 5749,
5755, 5764, 5770, 5790, 5796, 5805, 5811, 5822,
5828, 5837, 5843, 5868, 5874, 5883, 5889, 5900,
5906, 5915, 5921, 5941, 5947, 5956, 5962, 5973,
5979, 5988, 5994, 5858, 5864, 5873, 5879, 5890,
5896, 5905, 5911, 5931, 5937, 5946, 5952, 5963,
5969, 5978, 5984, 6009, 6015, 6024, 6030, 6041,
6047, 6056, 6062, 6082, 6088, 6097, 6103, 6114,
6120, 6129, 6135, 6158, 6164, 6173, 6179, 6190,
6196, 6205, 6211, 6231, 6237, 6246, 6252, 6263,
6269, 6278, 6284, 6309, 6315, 6324, 6330, 6341,
6347, 6356, 6362, 6382, 6388, 6397, 6403, 6414,
6420, 6429, 6435, 3515, 3521, 3530, 3536, 3547,
3553, 3562, 3568, 3588, 3594, 3603, 3609, 3620,
3626, 3635, 3641, 3666, 3672, 3681, 3687, 3698,
3704, 3713, 3719, 3739, 3745, 3754, 3760, 3771,
3777, 3786, 3792, 3815, 3821, 3830, 3836, 3847,
3853, 3862, 3868, 3888, 3894, 3903, 3909, 3920,
3926, 3935, 3941, 3966, 3972, 3981, 3987, 3998,
4004, 4013, 4019, 4039, 4045, 4054, 4060, 4071,
4077, 4086, 4092, 3956, 3962, 3971, 3977, 3988,
3994, 4003, 4009, 4029, 4035, 4044, 4050, 4061,
4067, 4076, 4082, 4107, 4113, 4122, 4128, 4139,
4145, 4154, 4160, 4180, 4186, 4195, 4201, 4212,
4218, 4227, 4233, 4256, 4262, 4271, 4277, 4288,
4294, 4303, 4309, 4329, 4335, 4344, 4350, 4361,
4367, 4376, 4382, 4407, 4413, 4422, 4428, 4439,
4445, 4454, 4460, 4480, 4486, 4495, 4501, 4512,
4518, 4527, 4533, 4328, 4334, 4343, 4349, 4360,
4366, 4375, 4381, 4401, 4407, 4416, 4422, 4433,
4439, 4448, 4454, 4479, 4485, 4494, 4500, 4511,
4517, 4526, 4532, 4552, 4558, 4567, 4573, 4584,
4590, 4599, 4605, 4628, 4634, 4643, 4649, 4660,
4666, 4675, 4681, 4701, 4707, 4716, 4722, 4733,
4739, 4748, 4754, 4779, 4785, 4794, 4800, 4811,
4817, 4826, 4832, 4852, 4858, 4867, 4873, 4884,
4890, 4899, 4905, 4769, 4775, 4784, 4790, 4801,
4807, 4816, 4822, 4842, 4848, 4857, 4863, 4874,
4880, 4889, 4895, 4920, 4926, 4935, 4941, 4952,
4958, 4967, 4973, 4993, 4999, 5008, 5014, 5025,
5031, 5040, 5046, 5069, 5075, 5084, 5090, 5101,
5107, 5116, 5122, 5142, 5148, 5157, 5163, 5174,
5180, 5189, 5195, 5220, 5226, 5235, 5241, 5252,
5258, 5267, 5273, 5293, 5299, 5308, 5314, 5325,
5331, 5340, 5346, 4604, 4610, 4619, 4625, 4636,
4642, 4651, 4657, 4677, 4683, 4692, 4698, 4709,
4715, 4724, 4730, 4755, 4761, 4770, 4776, 4787,
4793, 4802, 4808, 4828, 4834, 4843, 4849, 4860,
4866, 4875, 4881, 4904, 4910, 4919, 4925, 4936,
4942, 4951, 4957, 4977, 4983, 4992, 4998, 5009,
5015, 5024, 5030, 5055, 5061, 5070, 5076, 5087,
5093, 5102, 5108, 5128, 5134, 5143, 5149, 5160,
5166, 5175, 5181, 5045, 5051, 5060, 5066, 5077,
5083, 5092, 5098, 5118, 5124, 5133, 5139, 5150,
5156, 5165, 5171, 5196, 5202, 5211, 5217, 5228,
5234, 5243, 5249, 5269, 5275, 5284, 5290, 5301,
5307, 5316, 5322, 5345, 5351, 5360, 5366, 5377,
5383, 5392, 5398, 5418, 5424, 5433, 5439, 5450,
5456, 5465, 5471, 5496, 5502, 5511, 5517, 5528,
5534, 5543, 5549, 5569, 5575, 5584, 5590, 5601,
5607, 5616, 5622, 5417, 5423, 5432, 5438, 5449,
5455, 5464, 5470, 5490, 5496, 5505, 5511, 5522,
5528, 5537, 5543, 5568, 5574, 5583, 5589, 5600,
5606, 5615, 5621, 5641, 5647, 5656, 5662, 5673,
5679, 5688, 5694, 5717, 5723, 5732, 5738, 5749,
5755, 5764, 5770, 5790, 5796, 5805, 5811, 5822,
5828, 5837, 5843, 5868, 5874, 5883, 5889, 5900,
5906, 5915, 5921, 5941, 5947, 5956, 5962, 5973,
5979, 5988, 5994, 5858, 5864, 5873, 5879, 5890,
5896, 5905, 5911, 5931, 5937, 5946, 5952, 5963,
5969, 5978, 5984, 6009, 6015, 6024, 6030, 6041,
6047, 6056, 6062, 6082, 6088, 6097, 6103, 6114,
6120, 6129, 6135, 6158, 6164, 6173, 6179, 6190,
6196, 6205, 6211, 6231, 6237, 6246, 6252, 6263,
6269, 6278, 6284, 6309, 6315, 6324, 6330, 6341,
6347, 6356, 6362, 6382, 6388, 6397, 6403, 6414,
6420, 6429, 6435, 5303, 5309, 5318, 5324, 5335,
5341, 5350, 5356, 5376, 5382, 5391, 5397, 5408,
5414, 5423, 5429, 5454, 5460, 5469, 5475, 5486,
5492, 5501, 5507, 5527, 5533, 5542, 5548, 5559,
5565, 5574, 5580, 5603, 5609, 5618, 5624, 5635,
5641, 5650, 5656, 5676, 5682, 5691, 5697, 5708,
5714, 5723, 5729, 5754, 5760, 5769, 5775, 5786,
5792, 5801, 5807, 5827, 5833, 5842, 5848, 5859,
5865, 5874, 5880, 5744, 5750, 5759, 5765, 5776,
5782, 5791, 5797, 5817, 5823, 5832, 5838, 5849,
5855, 5864, 5870, 5895, 5901, 5910, 5916, 5927,
5933, 5942, 5948, 5968, 5974, 5983, 5989, 6000,
6006, 6015, 6021, 6044, 6050, 6059, 6065, 6076,
6082, 6091, 6097, 6117, 6123, 6132, 6138, 6149,
6155, 6164, 6170, 6195, 6201, 6210, 6216, 6227,
6233, 6242, 6248, 6268, 6274, 6283, 6289, 6300,
6306, 6315, 6321, 6116, 6122, 6131, 6137, 6148,
6154, 6163, 6169, 6189, 6195, 6204, 6210, 6221,
6227, 6236, 6242, 6267, 6273, 6282, 6288, 6299,
6305, 6314, 6320, 6340, 6346, 6355, 6361, 6372,
6378, 6387, 6393, 6416, 6422, 6431, 6437, 6448,
6454, 6463, 6469, 6489, 6495, 6504, 6510, 6521,
6527, 6536, 6542, 6567, 6573, 6582, 6588, 6599,
6605, 6614, 6620, 6640, 6646, 6655, 6661, 6672,
6678, 6687, 6693, 6557, 6563, 6572, 6578, 6589,
6595, 6604, 6610, 6630, 6636, 6645, 6651, 6662,
6668, 6677, 6683, 6708, 6714, 6723, 6729, 6740,
6746, 6755, 6761, 6781, 6787, 6796, 6802, 6813,
6819, 6828, 6834, 6857, 6863, 6872, 6878, 6889,
6895, 6904, 6910, 6930, 6936, 6945, 6951, 6962,
6968, 6977, 6983, 7008, 7014, 7023, 7029, 7040,
7046, 7055, 7061, 7081, 7087, 7096, 7102, 7113,
7119, 7128, 7134, 6392, 6398, 6407, 6413, 6424,
6430, 6439, 6445, 6465, 6471, 6480, 6486, 6497,
6503, 6512, 6518, 6543, 6549, 6558, 6564, 6575,
6581, 6590, 6596, 6616, 6622, 6631, 6637, 6648,
6654, 6663, 6669, 6692, 6698, 6707, 6713, 6724,
6730, 6739, 6745, 6765, 6771, 6780, 6786, 6797,
6803, 6812, 6818, 6843, 6849, 6858, 6864, 6875,
6881, 6890, 6896, 6916, 6922, 6931, 6937, 6948,
6954, 6963, 6969, 6833, 6839, 6848, 6854, 6865,
6871, 6880, 6886, 6906, 6912, 6921, 6927, 6938,
6944, 6953, 6959, 6984, 6990, 6999, 7005, 7016,
7022, 7031, 7037, 7057, 7063, 7072, 7078, 7089,
7095, 7104, 7110, 7133, 7139, 7148, 7154, 7165,
7171, 7180, 7186, 7206, 7212, 7221, 7227, 7238,
7244, 7253, 7259, 7284, 7290, 7299, 7305, 7316,
7322, 7331, 7337, 7357, 7363, 7372, 7378, 7389,
7395, 7404, 7410, 7205, 7211, 7220, 7226, 7237,
7243, 7252, 7258, 7278, 7284, 7293, 7299, 7310,
7316, 7325, 7331, 7356, 7362, 7371, 7377, 7388,
7394, 7403, 7409, 7429, 7435, 7444, 7450, 7461,
7467, 7476, 7482, 7505, 7511, 7520, 7526, 7537,
7543, 7552, 7558, 7578, 7584, 7593, 7599, 7610,
7616, 7625, 7631, 7656, 7662, 7671, 7677, 7688,
7694, 7703, 7709, 7729, 7735, 7744, 7750, 7761
};
//------------------------------------------------------------------------------
// Tables for level coding
const uint8_t VP8EncBands[16 + 1] = {
  0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7,
  0  // sentinel
};
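// Reading the band table (illustration, based on the tables above): the cost
// loops look up probabilities per band rather than per position, i.e.
// res->prob[VP8EncBands[n]][ctx][0], and most positions past 3 collapse into
// band 6. That is why GetResidualCost_C below can use res->prob[n][ctx0][0]
// directly: VP8EncBands[n] == n holds for n == 0 and 1, the only values that
// res->first takes.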
//------------------------------------------------------------------------------
// Mode costs
static int GetResidualCost_C(int ctx0, const VP8Residual* const res) {
  int n = res->first;
  // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
  const int p0 = res->prob[n][ctx0][0];
  CostArrayPtr const costs = res->costs;
  const uint16_t* t = costs[n][ctx0];
  // bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
  // (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
  // be missing during the loop.
  int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;

  if (res->last < 0) {
    return VP8BitCost(0, p0);
  }
  for (; n < res->last; ++n) {
    const int v = abs(res->coeffs[n]);
    const int ctx = (v >= 2) ? 2 : v;
    cost += VP8LevelCost(t, v);
    t = costs[n + 1][ctx];
  }
  // Last coefficient is always non-zero
  {
    const int v = abs(res->coeffs[n]);
    assert(v != 0);
    cost += VP8LevelCost(t, v);
    if (n < 15) {
      const int b = VP8EncBands[n + 1];
      const int ctx = (v == 1) ? 1 : 2;
      const int last_p0 = res->prob[b][ctx][0];
      cost += VP8BitCost(0, last_p0);
    }
  }
  return cost;
}

static void SetResidualCoeffs_C(const int16_t* const coeffs,
                                VP8Residual* const res) {
  int n;
  res->last = -1;
  assert(res->first == 0 || coeffs[0] == 0);
  for (n = 15; n >= 0; --n) {
    if (coeffs[n]) {
      res->last = n;
      break;
    }
  }
  res->coeffs = coeffs;
}
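// For context (paraphrased from cost_enc.h; treat the exact shape as an
// assumption): VP8LevelCost() combines the fixed and variable cost parts,
// roughly
//   static WEBP_INLINE int VP8LevelCost(const uint16_t* const t, int level) {
//     return VP8LevelFixedCosts[level] +
//            t[(level > MAX_VARIABLE_LEVEL) ? MAX_VARIABLE_LEVEL : level];
//   }
// which the SIMD variants below inline as
// VP8LevelFixedCosts[flevel] + t[level] after pre-clamping the level.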
//------------------------------------------------------------------------------
// init function
VP8GetResidualCostFunc VP8GetResidualCost;
VP8SetResidualCoeffsFunc VP8SetResidualCoeffs;
extern void VP8EncDspCostInitMIPS32(void);
extern void VP8EncDspCostInitMIPSdspR2(void);
extern void VP8EncDspCostInitSSE2(void);
extern void VP8EncDspCostInitNEON(void);
WEBP_DSP_INIT_FUNC(VP8EncDspCostInit) {
  VP8GetResidualCost = GetResidualCost_C;
  VP8SetResidualCoeffs = SetResidualCoeffs_C;

  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
  if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_MIPS32)
    if (VP8GetCPUInfo(kMIPS32)) {
      VP8EncDspCostInitMIPS32();
    }
#endif
#if defined(WEBP_USE_MIPS_DSP_R2)
    if (VP8GetCPUInfo(kMIPSdspR2)) {
      VP8EncDspCostInitMIPSdspR2();
    }
#endif
#if defined(WEBP_HAVE_SSE2)
    if (VP8GetCPUInfo(kSSE2)) {
      VP8EncDspCostInitSSE2();
    }
#endif
#if defined(WEBP_HAVE_NEON)
    if (VP8GetCPUInfo(kNEON)) {
      VP8EncDspCostInitNEON();
    }
#endif
  }
}
//------------------------------------------------------------------------------
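A minimal usage sketch for the pointers installed above (illustrative only;
VP8InitResidual and the surrounding setup live elsewhere in cost_enc.c/.h, so
the exact call shapes here are assumptions):

  int16_t coeffs[16] = { 5, -3, 0, 1 };  // rest implicitly zero
  VP8Residual res;
  VP8EncDspCostInit();                 // idempotent; picks C or SIMD versions
  VP8InitResidual(0, 0, enc, &res);    // hypothetical: fills res.prob/res.costs
  VP8SetResidualCoeffs(coeffs, &res);  // res.last = 3, res.coeffs = coeffs
  const int cost = VP8GetResidualCost(0 /* ctx0 */, &res);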


@@ -0,0 +1,154 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Author: Djordje Pesut (djordje.pesut@imgtec.com)
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MIPS32)
#include "src/enc/cost_enc.h"
static int GetResidualCost_MIPS32(int ctx0, const VP8Residual* const res) {
int temp0, temp1;
int v_reg, ctx_reg;
int n = res->first;
// should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
int p0 = res->prob[n][ctx0][0];
CostArrayPtr const costs = res->costs;
const uint16_t* t = costs[n][ctx0];
// bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
// (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
// be missing during the loop.
int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
const int16_t* res_coeffs = res->coeffs;
const int res_last = res->last;
const int const_max_level = MAX_VARIABLE_LEVEL;
const int const_2 = 2;
const uint16_t** p_costs = &costs[n][0];
const size_t inc_p_costs = NUM_CTX * sizeof(*p_costs);
if (res->last < 0) {
return VP8BitCost(0, p0);
}
__asm__ volatile (
".set push \n\t"
".set noreorder \n\t"
"subu %[temp1], %[res_last], %[n] \n\t"
"sll %[temp0], %[n], 1 \n\t"
"blez %[temp1], 2f \n\t"
" addu %[res_coeffs], %[res_coeffs], %[temp0] \n\t"
"1: \n\t"
"lh %[v_reg], 0(%[res_coeffs]) \n\t"
"addiu %[n], %[n], 1 \n\t"
"negu %[temp0], %[v_reg] \n\t"
"slti %[temp1], %[v_reg], 0 \n\t"
"movn %[v_reg], %[temp0], %[temp1] \n\t"
"sltiu %[temp0], %[v_reg], 2 \n\t"
"move %[ctx_reg], %[v_reg] \n\t"
"movz %[ctx_reg], %[const_2], %[temp0] \n\t"
"sll %[temp1], %[v_reg], 1 \n\t"
"addu %[temp1], %[temp1], %[VP8LevelFixedCosts] \n\t"
"lhu %[temp1], 0(%[temp1]) \n\t"
"slt %[temp0], %[v_reg], %[const_max_level] \n\t"
"movz %[v_reg], %[const_max_level], %[temp0] \n\t"
"addu %[cost], %[cost], %[temp1] \n\t"
"sll %[v_reg], %[v_reg], 1 \n\t"
"sll %[ctx_reg], %[ctx_reg], 2 \n\t"
"addu %[v_reg], %[v_reg], %[t] \n\t"
"lhu %[temp0], 0(%[v_reg]) \n\t"
"addu %[p_costs], %[p_costs], %[inc_p_costs] \n\t"
"addu %[t], %[p_costs], %[ctx_reg] \n\t"
"addu %[cost], %[cost], %[temp0] \n\t"
"addiu %[res_coeffs], %[res_coeffs], 2 \n\t"
"bne %[n], %[res_last], 1b \n\t"
" lw %[t], 0(%[t]) \n\t"
"2: \n\t"
".set pop \n\t"
: [cost]"+&r"(cost), [t]"+&r"(t), [n]"+&r"(n), [v_reg]"=&r"(v_reg),
[ctx_reg]"=&r"(ctx_reg), [p_costs]"+&r"(p_costs), [temp0]"=&r"(temp0),
[temp1]"=&r"(temp1), [res_coeffs]"+&r"(res_coeffs)
: [const_2]"r"(const_2), [const_max_level]"r"(const_max_level),
[VP8LevelFixedCosts]"r"(VP8LevelFixedCosts), [res_last]"r"(res_last),
[inc_p_costs]"r"(inc_p_costs)
: "memory"
);
// Last coefficient is always non-zero
{
const int v = abs(res->coeffs[n]);
assert(v != 0);
cost += VP8LevelCost(t, v);
if (n < 15) {
const int b = VP8EncBands[n + 1];
const int ctx = (v == 1) ? 1 : 2;
const int last_p0 = res->prob[b][ctx][0];
cost += VP8BitCost(0, last_p0);
}
}
return cost;
}
static void SetResidualCoeffs_MIPS32(const int16_t* const coeffs,
VP8Residual* const res) {
const int16_t* p_coeffs = (int16_t*)coeffs;
int temp0, temp1, temp2, n, n1;
assert(res->first == 0 || coeffs[0] == 0);
__asm__ volatile (
".set push \n\t"
".set noreorder \n\t"
"addiu %[p_coeffs], %[p_coeffs], 28 \n\t"
"li %[n], 15 \n\t"
"li %[temp2], -1 \n\t"
"0: \n\t"
"ulw %[temp0], 0(%[p_coeffs]) \n\t"
"beqz %[temp0], 1f \n\t"
#if defined(WORDS_BIGENDIAN)
" sll %[temp1], %[temp0], 16 \n\t"
#else
" srl %[temp1], %[temp0], 16 \n\t"
#endif
"addiu %[n1], %[n], -1 \n\t"
"movz %[temp0], %[n1], %[temp1] \n\t"
"movn %[temp0], %[n], %[temp1] \n\t"
"j 2f \n\t"
" addiu %[temp2], %[temp0], 0 \n\t"
"1: \n\t"
"addiu %[n], %[n], -2 \n\t"
"bgtz %[n], 0b \n\t"
" addiu %[p_coeffs], %[p_coeffs], -4 \n\t"
"2: \n\t"
".set pop \n\t"
: [p_coeffs]"+&r"(p_coeffs), [temp0]"=&r"(temp0),
[temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[n]"=&r"(n), [n1]"=&r"(n1)
:
: "memory"
);
res->last = temp2;
res->coeffs = coeffs;
}
//------------------------------------------------------------------------------
// Entry point
extern void VP8EncDspCostInitMIPS32(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitMIPS32(void) {
VP8GetResidualCost = GetResidualCost_MIPS32;
VP8SetResidualCoeffs = SetResidualCoeffs_MIPS32;
}
#else // !WEBP_USE_MIPS32
WEBP_DSP_INIT_STUB(VP8EncDspCostInitMIPS32)
#endif // WEBP_USE_MIPS32
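The MIPS32 SetResidualCoeffs above walks the coefficients backwards two at a
time, loading each pair with one unaligned word read (ulw) and using the
big/little-endian halves to decide which index is last. A portable sketch of
the same search (illustration, not upstream code):

  int n, last = -1;
  for (n = 15; n > 0; n -= 2) {      // inspect coeffs[n] and coeffs[n - 1]
    if (coeffs[n] != 0)     { last = n;     break; }
    if (coeffs[n - 1] != 0) { last = n - 1; break; }
  }
  res->last = last;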


@@ -0,0 +1,107 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Author: Djordje Pesut (djordje.pesut@imgtec.com)
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
#include "src/enc/cost_enc.h"
static int GetResidualCost_MIPSdspR2(int ctx0, const VP8Residual* const res) {
int temp0, temp1;
int v_reg, ctx_reg;
int n = res->first;
// should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
int p0 = res->prob[n][ctx0][0];
CostArrayPtr const costs = res->costs;
const uint16_t* t = costs[n][ctx0];
// bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
// (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
// be missing during the loop.
int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
const int16_t* res_coeffs = res->coeffs;
const int res_last = res->last;
const int const_max_level = MAX_VARIABLE_LEVEL;
const int const_2 = 2;
const uint16_t** p_costs = &costs[n][0];
const size_t inc_p_costs = NUM_CTX * sizeof(*p_costs);
if (res->last < 0) {
return VP8BitCost(0, p0);
}
__asm__ volatile (
".set push \n\t"
".set noreorder \n\t"
"subu %[temp1], %[res_last], %[n] \n\t"
"blez %[temp1], 2f \n\t"
" nop \n\t"
"1: \n\t"
"sll %[temp0], %[n], 1 \n\t"
"lhx %[v_reg], %[temp0](%[res_coeffs]) \n\t"
"addiu %[n], %[n], 1 \n\t"
"absq_s.w %[v_reg], %[v_reg] \n\t"
"sltiu %[temp0], %[v_reg], 2 \n\t"
"move %[ctx_reg], %[v_reg] \n\t"
"movz %[ctx_reg], %[const_2], %[temp0] \n\t"
"sll %[temp1], %[v_reg], 1 \n\t"
"lhx %[temp1], %[temp1](%[VP8LevelFixedCosts]) \n\t"
"slt %[temp0], %[v_reg], %[const_max_level] \n\t"
"movz %[v_reg], %[const_max_level], %[temp0] \n\t"
"addu %[cost], %[cost], %[temp1] \n\t"
"sll %[v_reg], %[v_reg], 1 \n\t"
"sll %[ctx_reg], %[ctx_reg], 2 \n\t"
"lhx %[temp0], %[v_reg](%[t]) \n\t"
"addu %[p_costs], %[p_costs], %[inc_p_costs] \n\t"
"addu %[t], %[p_costs], %[ctx_reg] \n\t"
"addu %[cost], %[cost], %[temp0] \n\t"
"bne %[n], %[res_last], 1b \n\t"
" lw %[t], 0(%[t]) \n\t"
"2: \n\t"
".set pop \n\t"
: [cost]"+&r"(cost), [t]"+&r"(t), [n]"+&r"(n), [v_reg]"=&r"(v_reg),
[ctx_reg]"=&r"(ctx_reg), [p_costs]"+&r"(p_costs), [temp0]"=&r"(temp0),
[temp1]"=&r"(temp1)
: [const_2]"r"(const_2), [const_max_level]"r"(const_max_level),
[VP8LevelFixedCosts]"r"(VP8LevelFixedCosts), [res_last]"r"(res_last),
[res_coeffs]"r"(res_coeffs), [inc_p_costs]"r"(inc_p_costs)
: "memory"
);
// Last coefficient is always non-zero
{
const int v = abs(res->coeffs[n]);
assert(v != 0);
cost += VP8LevelCost(t, v);
if (n < 15) {
const int b = VP8EncBands[n + 1];
const int ctx = (v == 1) ? 1 : 2;
const int last_p0 = res->prob[b][ctx][0];
cost += VP8BitCost(0, last_p0);
}
}
return cost;
}
//------------------------------------------------------------------------------
// Entry point
extern void VP8EncDspCostInitMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitMIPSdspR2(void) {
VP8GetResidualCost = GetResidualCost_MIPSdspR2;
}
#else // !WEBP_USE_MIPS_DSP_R2
WEBP_DSP_INIT_STUB(VP8EncDspCostInitMIPSdspR2)
#endif // WEBP_USE_MIPS_DSP_R2


@@ -0,0 +1,122 @@
// Copyright 2018 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// ARM NEON version of cost functions
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_NEON)
#include "src/dsp/neon.h"
#include "src/enc/cost_enc.h"
static const uint8_t position[16] = { 1, 2, 3, 4, 5, 6, 7, 8,
                                      9, 10, 11, 12, 13, 14, 15, 16 };

static void SetResidualCoeffs_NEON(const int16_t* const coeffs,
                                   VP8Residual* const res) {
  const int16x8_t minus_one = vdupq_n_s16(-1);
  const int16x8_t coeffs_0 = vld1q_s16(coeffs);
  const int16x8_t coeffs_1 = vld1q_s16(coeffs + 8);
  const uint16x8_t eob_0 = vtstq_s16(coeffs_0, minus_one);
  const uint16x8_t eob_1 = vtstq_s16(coeffs_1, minus_one);
  const uint8x16_t eob = vcombine_u8(vqmovn_u16(eob_0), vqmovn_u16(eob_1));
  const uint8x16_t masked = vandq_u8(eob, vld1q_u8(position));

#ifdef __aarch64__
  res->last = vmaxvq_u8(masked) - 1;
#else
  const uint8x8_t eob_8x8 = vmax_u8(vget_low_u8(masked), vget_high_u8(masked));
  const uint16x8_t eob_16x8 = vmovl_u8(eob_8x8);
  const uint16x4_t eob_16x4 =
      vmax_u16(vget_low_u16(eob_16x8), vget_high_u16(eob_16x8));
  const uint32x4_t eob_32x4 = vmovl_u16(eob_16x4);
  uint32x2_t eob_32x2 =
      vmax_u32(vget_low_u32(eob_32x4), vget_high_u32(eob_32x4));
  eob_32x2 = vpmax_u32(eob_32x2, eob_32x2);

  vst1_lane_s32(&res->last, vreinterpret_s32_u32(eob_32x2), 0);
  --res->last;
#endif  // __aarch64__

  res->coeffs = coeffs;
}
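// Scalar restatement of the trick above (illustration only): vtstq_s16 turns
// every non-zero lane into all-ones, the AND with position[] keeps 1-based
// indices of the non-zero lanes, and the horizontal max minus one yields the
// 0-based index of the last non-zero coefficient (-1 when all 16 are zero):
//   int last = 0;
//   for (int i = 0; i < 16; ++i) if (coeffs[i] != 0) last = i + 1;
//   res->last = last - 1;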
static int GetResidualCost_NEON(int ctx0, const VP8Residual* const res) {
uint8_t levels[16], ctxs[16];
uint16_t abs_levels[16];
int n = res->first;
// should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
const int p0 = res->prob[n][ctx0][0];
CostArrayPtr const costs = res->costs;
const uint16_t* t = costs[n][ctx0];
// bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
// (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
// be missing during the loop.
int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
if (res->last < 0) {
return VP8BitCost(0, p0);
}
{ // precompute clamped levels and contexts, packed to 8b.
const uint8x16_t kCst2 = vdupq_n_u8(2);
const uint8x16_t kCst67 = vdupq_n_u8(MAX_VARIABLE_LEVEL);
const int16x8_t c0 = vld1q_s16(res->coeffs);
const int16x8_t c1 = vld1q_s16(res->coeffs + 8);
const uint16x8_t E0 = vreinterpretq_u16_s16(vabsq_s16(c0));
const uint16x8_t E1 = vreinterpretq_u16_s16(vabsq_s16(c1));
const uint8x16_t F = vcombine_u8(vqmovn_u16(E0), vqmovn_u16(E1));
const uint8x16_t G = vminq_u8(F, kCst2); // context = 0,1,2
const uint8x16_t H = vminq_u8(F, kCst67); // clamp_level in [0..67]
vst1q_u8(ctxs, G);
vst1q_u8(levels, H);
vst1q_u16(abs_levels, E0);
vst1q_u16(abs_levels + 8, E1);
}
for (; n < res->last; ++n) {
const int ctx = ctxs[n];
const int level = levels[n];
const int flevel = abs_levels[n]; // full level
cost += VP8LevelFixedCosts[flevel] + t[level]; // simplified VP8LevelCost()
t = costs[n + 1][ctx];
}
// Last coefficient is always non-zero
{
const int level = levels[n];
const int flevel = abs_levels[n];
assert(flevel != 0);
cost += VP8LevelFixedCosts[flevel] + t[level];
if (n < 15) {
const int b = VP8EncBands[n + 1];
const int ctx = ctxs[n];
const int last_p0 = res->prob[b][ctx][0];
cost += VP8BitCost(0, last_p0);
}
}
return cost;
}
//------------------------------------------------------------------------------
// Entry point
extern void VP8EncDspCostInitNEON(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitNEON(void) {
VP8SetResidualCoeffs = SetResidualCoeffs_NEON;
VP8GetResidualCost = GetResidualCost_NEON;
}
#else // !WEBP_USE_NEON
WEBP_DSP_INIT_STUB(VP8EncDspCostInitNEON)
#endif // WEBP_USE_NEON


@@ -0,0 +1,119 @@
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE2 version of cost functions
//
// Author: Skal (pascal.massimino@gmail.com)
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_SSE2)
#include <emmintrin.h>
#include "src/enc/cost_enc.h"
#include "src/enc/vp8i_enc.h"
#include "src/utils/utils.h"
//------------------------------------------------------------------------------
static void SetResidualCoeffs_SSE2(const int16_t* const coeffs,
                                   VP8Residual* const res) {
  const __m128i c0 = _mm_loadu_si128((const __m128i*)(coeffs + 0));
  const __m128i c1 = _mm_loadu_si128((const __m128i*)(coeffs + 8));
  // Use SSE2 to compare 16 values with a single instruction.
  const __m128i zero = _mm_setzero_si128();
  const __m128i m0 = _mm_packs_epi16(c0, c1);
  const __m128i m1 = _mm_cmpeq_epi8(m0, zero);
  // Get the comparison results as a bitmask into 16 bits. Negate the mask to
  // get the position of entries that are not equal to zero. We don't need to
  // mask out least significant bits according to res->first, since coeffs[0]
  // is 0 if res->first > 0.
  const uint32_t mask = 0x0000ffffu ^ (uint32_t)_mm_movemask_epi8(m1);
  // The position of the most significant non-zero bit indicates the position
  // of the last non-zero value.
  assert(res->first == 0 || coeffs[0] == 0);
  res->last = mask ? BitsLog2Floor(mask) : -1;
  res->coeffs = coeffs;
}
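// Scalar restatement of the bitmask trick above (illustration only): after
// packing to bytes, bit i of 'mask' is set iff coeffs[i] != 0, so the highest
// set bit is the last non-zero position. With compiler builtins (an
// assumption; libwebp's BitsLog2Floor() hides the portable fallback):
//   res->last = (mask != 0) ? (31 - __builtin_clz(mask)) : -1;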
static int GetResidualCost_SSE2(int ctx0, const VP8Residual* const res) {
uint8_t levels[16], ctxs[16];
uint16_t abs_levels[16];
int n = res->first;
// should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
const int p0 = res->prob[n][ctx0][0];
CostArrayPtr const costs = res->costs;
const uint16_t* t = costs[n][ctx0];
// bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
// (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
// be missing during the loop.
int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
if (res->last < 0) {
return VP8BitCost(0, p0);
}
{ // precompute clamped levels and contexts, packed to 8b.
const __m128i zero = _mm_setzero_si128();
const __m128i kCst2 = _mm_set1_epi8(2);
const __m128i kCst67 = _mm_set1_epi8(MAX_VARIABLE_LEVEL);
const __m128i c0 = _mm_loadu_si128((const __m128i*)&res->coeffs[0]);
const __m128i c1 = _mm_loadu_si128((const __m128i*)&res->coeffs[8]);
const __m128i D0 = _mm_sub_epi16(zero, c0);
const __m128i D1 = _mm_sub_epi16(zero, c1);
const __m128i E0 = _mm_max_epi16(c0, D0); // abs(v), 16b
const __m128i E1 = _mm_max_epi16(c1, D1);
const __m128i F = _mm_packs_epi16(E0, E1);
const __m128i G = _mm_min_epu8(F, kCst2); // context = 0,1,2
const __m128i H = _mm_min_epu8(F, kCst67); // clamp_level in [0..67]
_mm_storeu_si128((__m128i*)&ctxs[0], G);
_mm_storeu_si128((__m128i*)&levels[0], H);
_mm_storeu_si128((__m128i*)&abs_levels[0], E0);
_mm_storeu_si128((__m128i*)&abs_levels[8], E1);
}
for (; n < res->last; ++n) {
const int ctx = ctxs[n];
const int level = levels[n];
const int flevel = abs_levels[n]; // full level
cost += VP8LevelFixedCosts[flevel] + t[level]; // simplified VP8LevelCost()
t = costs[n + 1][ctx];
}
// Last coefficient is always non-zero
{
const int level = levels[n];
const int flevel = abs_levels[n];
assert(flevel != 0);
cost += VP8LevelFixedCosts[flevel] + t[level];
if (n < 15) {
const int b = VP8EncBands[n + 1];
const int ctx = ctxs[n];
const int last_p0 = res->prob[b][ctx][0];
cost += VP8BitCost(0, last_p0);
}
}
return cost;
}
//------------------------------------------------------------------------------
// Entry point
extern void VP8EncDspCostInitSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitSSE2(void) {
VP8SetResidualCoeffs = SetResidualCoeffs_SSE2;
VP8GetResidualCost = GetResidualCost_SSE2;
}
#else // !WEBP_USE_SSE2
WEBP_DSP_INIT_STUB(VP8EncDspCostInitSSE2)
#endif // WEBP_USE_SSE2

media/libwebp/src/dsp/enc.c

@@ -0,0 +1,830 @@
// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Speed-critical encoding functions.
//
// Author: Skal (pascal.massimino@gmail.com)
#include <assert.h>
#include <stdlib.h> // for abs()
#include "src/dsp/dsp.h"
#include "src/enc/vp8i_enc.h"
static WEBP_INLINE uint8_t clip_8b(int v) {
  return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
}
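// Worked values for the branch above (illustration): v & ~0xff is non-zero
// exactly when v is outside [0, 255], so clip_8b(300) == 255,
// clip_8b(-4) == 0, and clip_8b(128) == 128 via the fast path.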
#if !WEBP_NEON_OMIT_C_CODE
static WEBP_INLINE int clip_max(int v, int max) {
  return (v > max) ? max : v;
}
#endif // !WEBP_NEON_OMIT_C_CODE
//------------------------------------------------------------------------------
// Compute susceptibility based on DCT-coeff histograms:
// the higher, the "easier" the macroblock is to compress.
const int VP8DspScan[16 + 4 + 4] = {
// Luma
0 + 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS,
0 + 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS,
0 + 8 * BPS, 4 + 8 * BPS, 8 + 8 * BPS, 12 + 8 * BPS,
0 + 12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
0 + 0 * BPS, 4 + 0 * BPS, 0 + 4 * BPS, 4 + 4 * BPS, // U
8 + 0 * BPS, 12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS // V
};
// general-purpose util function
void VP8SetHistogramData(const int distribution[MAX_COEFF_THRESH + 1],
VP8Histogram* const histo) {
int max_value = 0, last_non_zero = 1;
int k;
for (k = 0; k <= MAX_COEFF_THRESH; ++k) {
const int value = distribution[k];
if (value > 0) {
if (value > max_value) max_value = value;
last_non_zero = k;
}
}
histo->max_value = max_value;
histo->last_non_zero = last_non_zero;
}
#if !WEBP_NEON_OMIT_C_CODE
static void CollectHistogram_C(const uint8_t* ref, const uint8_t* pred,
int start_block, int end_block,
VP8Histogram* const histo) {
int j;
int distribution[MAX_COEFF_THRESH + 1] = { 0 };
for (j = start_block; j < end_block; ++j) {
int k;
int16_t out[16];
VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
// Convert coefficients to bin.
for (k = 0; k < 16; ++k) {
const int v = abs(out[k]) >> 3;
const int clipped_value = clip_max(v, MAX_COEFF_THRESH);
++distribution[clipped_value];
}
}
VP8SetHistogramData(distribution, histo);
}
#endif // !WEBP_NEON_OMIT_C_CODE
//------------------------------------------------------------------------------
// run-time tables (~4k)
static uint8_t clip1[255 + 510 + 1]; // clips [-255,510] to [0,255]
// We declare this variable 'volatile' to prevent instruction reordering
// and make sure it's set to true _last_ (so as to be thread-safe)
static volatile int tables_ok = 0;
static WEBP_TSAN_IGNORE_FUNCTION void InitTables(void) {
if (!tables_ok) {
int i;
for (i = -255; i <= 255 + 255; ++i) {
clip1[255 + i] = clip_8b(i);
}
tables_ok = 1;
}
}
//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)
#if !WEBP_NEON_OMIT_C_CODE
#define STORE(x, y, v) \
dst[(x) + (y) * BPS] = clip_8b(ref[(x) + (y) * BPS] + ((v) >> 3))
static const int kC1 = 20091 + (1 << 16);
static const int kC2 = 35468;
#define MUL(a, b) (((a) * (b)) >> 16)
static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
uint8_t* dst) {
int C[4 * 4], *tmp;
int i;
tmp = C;
for (i = 0; i < 4; ++i) { // vertical pass
const int a = in[0] + in[8];
const int b = in[0] - in[8];
const int c = MUL(in[4], kC2) - MUL(in[12], kC1);
const int d = MUL(in[4], kC1) + MUL(in[12], kC2);
tmp[0] = a + d;
tmp[1] = b + c;
tmp[2] = b - c;
tmp[3] = a - d;
tmp += 4;
in++;
}
tmp = C;
for (i = 0; i < 4; ++i) { // horizontal pass
const int dc = tmp[0] + 4;
const int a = dc + tmp[8];
const int b = dc - tmp[8];
const int c = MUL(tmp[4], kC2) - MUL(tmp[12], kC1);
const int d = MUL(tmp[4], kC1) + MUL(tmp[12], kC2);
STORE(0, i, a + d);
STORE(1, i, b + c);
STORE(2, i, b - c);
STORE(3, i, a - d);
tmp++;
}
}
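// The fixed-point constants above, spelled out (illustration; these are the
// standard VP8 inverse-DCT rotations): MUL(a, kC2) == (a * 35468) >> 16,
// about a * 0.5412 = a * sqrt(2) * sin(pi / 8), while kC1 folds a "+ a" into
// the multiplier so MUL(a, kC1) == (a * (20091 + 65536)) >> 16, about
// a * 1.3066 = a * sqrt(2) * cos(pi / 8).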
static void ITransform_C(const uint8_t* ref, const int16_t* in, uint8_t* dst,
int do_two) {
ITransformOne(ref, in, dst);
if (do_two) {
ITransformOne(ref + 4, in + 16, dst + 4);
}
}
static void FTransform_C(const uint8_t* src, const uint8_t* ref, int16_t* out) {
int i;
int tmp[16];
for (i = 0; i < 4; ++i, src += BPS, ref += BPS) {
const int d0 = src[0] - ref[0]; // 9bit dynamic range ([-255,255])
const int d1 = src[1] - ref[1];
const int d2 = src[2] - ref[2];
const int d3 = src[3] - ref[3];
const int a0 = (d0 + d3); // 10b [-510,510]
const int a1 = (d1 + d2);
const int a2 = (d1 - d2);
const int a3 = (d0 - d3);
tmp[0 + i * 4] = (a0 + a1) * 8; // 14b [-8160,8160]
tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 1812) >> 9; // [-7536,7542]
tmp[2 + i * 4] = (a0 - a1) * 8;
tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 + 937) >> 9;
}
for (i = 0; i < 4; ++i) {
const int a0 = (tmp[0 + i] + tmp[12 + i]); // 15b
const int a1 = (tmp[4 + i] + tmp[ 8 + i]);
const int a2 = (tmp[4 + i] - tmp[ 8 + i]);
const int a3 = (tmp[0 + i] - tmp[12 + i]);
out[0 + i] = (a0 + a1 + 7) >> 4; // 12b
out[4 + i] = ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0);
out[8 + i] = (a0 - a1 + 7) >> 4;
out[12+ i] = ((a3 * 2217 - a2 * 5352 + 51000) >> 16);
}
}
#endif // !WEBP_NEON_OMIT_C_CODE
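// The forward transform mirrors the inverse rotations at 12-bit precision
// (illustration): 2217 / 4096 is about 0.5412 and 5352 / 4096 about 1.3066,
// the same two angles as kC1/kC2 above; the added constants (1812, 937,
// 12000, 51000) act as rounding biases for the >> 9 and >> 16 shifts.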
static void FTransform2_C(const uint8_t* src, const uint8_t* ref,
int16_t* out) {
VP8FTransform(src, ref, out);
VP8FTransform(src + 4, ref + 4, out + 16);
}
#if !WEBP_NEON_OMIT_C_CODE
static void FTransformWHT_C(const int16_t* in, int16_t* out) {
// input is 12b signed
int32_t tmp[16];
int i;
for (i = 0; i < 4; ++i, in += 64) {
const int a0 = (in[0 * 16] + in[2 * 16]); // 13b
const int a1 = (in[1 * 16] + in[3 * 16]);
const int a2 = (in[1 * 16] - in[3 * 16]);
const int a3 = (in[0 * 16] - in[2 * 16]);
tmp[0 + i * 4] = a0 + a1; // 14b
tmp[1 + i * 4] = a3 + a2;
tmp[2 + i * 4] = a3 - a2;
tmp[3 + i * 4] = a0 - a1;
}
for (i = 0; i < 4; ++i) {
const int a0 = (tmp[0 + i] + tmp[8 + i]); // 15b
const int a1 = (tmp[4 + i] + tmp[12+ i]);
const int a2 = (tmp[4 + i] - tmp[12+ i]);
const int a3 = (tmp[0 + i] - tmp[8 + i]);
const int b0 = a0 + a1; // 16b
const int b1 = a3 + a2;
const int b2 = a3 - a2;
const int b3 = a0 - a1;
out[ 0 + i] = b0 >> 1; // 15b
out[ 4 + i] = b1 >> 1;
out[ 8 + i] = b2 >> 1;
out[12 + i] = b3 >> 1;
}
}
#endif // !WEBP_NEON_OMIT_C_CODE
#undef MUL
#undef STORE
//------------------------------------------------------------------------------
// Intra predictions
static WEBP_INLINE void Fill(uint8_t* dst, int value, int size) {
int j;
for (j = 0; j < size; ++j) {
memset(dst + j * BPS, value, size);
}
}
static WEBP_INLINE void VerticalPred(uint8_t* dst,
const uint8_t* top, int size) {
int j;
if (top != NULL) {
for (j = 0; j < size; ++j) memcpy(dst + j * BPS, top, size);
} else {
Fill(dst, 127, size);
}
}
static WEBP_INLINE void HorizontalPred(uint8_t* dst,
const uint8_t* left, int size) {
if (left != NULL) {
int j;
for (j = 0; j < size; ++j) {
memset(dst + j * BPS, left[j], size);
}
} else {
Fill(dst, 129, size);
}
}
static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
const uint8_t* top, int size) {
int y;
if (left != NULL) {
if (top != NULL) {
const uint8_t* const clip = clip1 + 255 - left[-1];
for (y = 0; y < size; ++y) {
const uint8_t* const clip_table = clip + left[y];
int x;
for (x = 0; x < size; ++x) {
dst[x] = clip_table[top[x]];
}
dst += BPS;
}
} else {
HorizontalPred(dst, left, size);
}
} else {
// true motion without left samples (hence: with default 129 value)
// is equivalent to VE prediction where you just copy the top samples.
// Note that if top samples are not available, the default value is
// then 129, and not 127 as in the VerticalPred case.
if (top != NULL) {
VerticalPred(dst, top, size);
} else {
Fill(dst, 129, size);
}
}
}
static WEBP_INLINE void DCMode(uint8_t* dst, const uint8_t* left,
                               const uint8_t* top,
                               int size, int round, int shift) {
  int DC = 0;
  int j;
  if (top != NULL) {
    for (j = 0; j < size; ++j) DC += top[j];
    if (left != NULL) {    // top and left present
      for (j = 0; j < size; ++j) DC += left[j];
    } else {               // top, but no left
      DC += DC;
    }
    DC = (DC + round) >> shift;
  } else if (left != NULL) {  // left but no top
    for (j = 0; j < size; ++j) DC += left[j];
    DC += DC;
    DC = (DC + round) >> shift;
  } else {                 // no top, no left, nothing.
    DC = 0x80;
  }
  Fill(dst, DC, size);
}
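// Worked example for DCMode (illustration): the chroma call below passes
// (size, round, shift) = (8, 8, 4). With both edges present it sums 16
// samples and (DC + 8) >> 4 averages them; with one edge, DC += DC doubles
// the 8-sample sum so the same shift still divides by the sample count.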
//------------------------------------------------------------------------------
// Chroma 8x8 prediction (paragraph 12.2)
static void IntraChromaPreds_C(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
// U block
DCMode(C8DC8 + dst, left, top, 8, 8, 4);
VerticalPred(C8VE8 + dst, top, 8);
HorizontalPred(C8HE8 + dst, left, 8);
TrueMotion(C8TM8 + dst, left, top, 8);
// V block
dst += 8;
if (top != NULL) top += 8;
if (left != NULL) left += 16;
DCMode(C8DC8 + dst, left, top, 8, 8, 4);
VerticalPred(C8VE8 + dst, top, 8);
HorizontalPred(C8HE8 + dst, left, 8);
TrueMotion(C8TM8 + dst, left, top, 8);
}
//------------------------------------------------------------------------------
// luma 16x16 prediction (paragraph 12.3)
static void Intra16Preds_C(uint8_t* dst,
const uint8_t* left, const uint8_t* top) {
DCMode(I16DC16 + dst, left, top, 16, 16, 5);
VerticalPred(I16VE16 + dst, top, 16);
HorizontalPred(I16HE16 + dst, left, 16);
TrueMotion(I16TM16 + dst, left, top, 16);
}
//------------------------------------------------------------------------------
// luma 4x4 prediction
#define DST(x, y) dst[(x) + (y) * BPS]
#define AVG3(a, b, c) ((uint8_t)(((a) + 2 * (b) + (c) + 2) >> 2))
#define AVG2(a, b) (((a) + (b) + 1) >> 1)
static void VE4(uint8_t* dst, const uint8_t* top) { // vertical
const uint8_t vals[4] = {
AVG3(top[-1], top[0], top[1]),
AVG3(top[ 0], top[1], top[2]),
AVG3(top[ 1], top[2], top[3]),
AVG3(top[ 2], top[3], top[4])
};
int i;
for (i = 0; i < 4; ++i) {
memcpy(dst + i * BPS, vals, 4);
}
}
static void HE4(uint8_t* dst, const uint8_t* top) { // horizontal
const int X = top[-1];
const int I = top[-2];
const int J = top[-3];
const int K = top[-4];
const int L = top[-5];
WebPUint32ToMem(dst + 0 * BPS, 0x01010101U * AVG3(X, I, J));
WebPUint32ToMem(dst + 1 * BPS, 0x01010101U * AVG3(I, J, K));
WebPUint32ToMem(dst + 2 * BPS, 0x01010101U * AVG3(J, K, L));
WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
}
static void DC4(uint8_t* dst, const uint8_t* top) {
uint32_t dc = 4;
int i;
for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
Fill(dst, dc >> 3, 4);
}
static void RD4(uint8_t* dst, const uint8_t* top) {
const int X = top[-1];
const int I = top[-2];
const int J = top[-3];
const int K = top[-4];
const int L = top[-5];
const int A = top[0];
const int B = top[1];
const int C = top[2];
const int D = top[3];
DST(0, 3) = AVG3(J, K, L);
DST(0, 2) = DST(1, 3) = AVG3(I, J, K);
DST(0, 1) = DST(1, 2) = DST(2, 3) = AVG3(X, I, J);
DST(0, 0) = DST(1, 1) = DST(2, 2) = DST(3, 3) = AVG3(A, X, I);
DST(1, 0) = DST(2, 1) = DST(3, 2) = AVG3(B, A, X);
DST(2, 0) = DST(3, 1) = AVG3(C, B, A);
DST(3, 0) = AVG3(D, C, B);
}
static void LD4(uint8_t* dst, const uint8_t* top) {
const int A = top[0];
const int B = top[1];
const int C = top[2];
const int D = top[3];
const int E = top[4];
const int F = top[5];
const int G = top[6];
const int H = top[7];
DST(0, 0) = AVG3(A, B, C);
DST(1, 0) = DST(0, 1) = AVG3(B, C, D);
DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E);
DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F);
DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G);
DST(3, 2) = DST(2, 3) = AVG3(F, G, H);
DST(3, 3) = AVG3(G, H, H);
}
static void VR4(uint8_t* dst, const uint8_t* top) {
const int X = top[-1];
const int I = top[-2];
const int J = top[-3];
const int K = top[-4];
const int A = top[0];
const int B = top[1];
const int C = top[2];
const int D = top[3];
DST(0, 0) = DST(1, 2) = AVG2(X, A);
DST(1, 0) = DST(2, 2) = AVG2(A, B);
DST(2, 0) = DST(3, 2) = AVG2(B, C);
DST(3, 0) = AVG2(C, D);
DST(0, 3) = AVG3(K, J, I);
DST(0, 2) = AVG3(J, I, X);
DST(0, 1) = DST(1, 3) = AVG3(I, X, A);
DST(1, 1) = DST(2, 3) = AVG3(X, A, B);
DST(2, 1) = DST(3, 3) = AVG3(A, B, C);
DST(3, 1) = AVG3(B, C, D);
}
static void VL4(uint8_t* dst, const uint8_t* top) {
const int A = top[0];
const int B = top[1];
const int C = top[2];
const int D = top[3];
const int E = top[4];
const int F = top[5];
const int G = top[6];
const int H = top[7];
DST(0, 0) = AVG2(A, B);
DST(1, 0) = DST(0, 2) = AVG2(B, C);
DST(2, 0) = DST(1, 2) = AVG2(C, D);
DST(3, 0) = DST(2, 2) = AVG2(D, E);
DST(0, 1) = AVG3(A, B, C);
DST(1, 1) = DST(0, 3) = AVG3(B, C, D);
DST(2, 1) = DST(1, 3) = AVG3(C, D, E);
DST(3, 1) = DST(2, 3) = AVG3(D, E, F);
DST(3, 2) = AVG3(E, F, G);
DST(3, 3) = AVG3(F, G, H);
}
static void HU4(uint8_t* dst, const uint8_t* top) {
const int I = top[-2];
const int J = top[-3];
const int K = top[-4];
const int L = top[-5];
DST(0, 0) = AVG2(I, J);
DST(2, 0) = DST(0, 1) = AVG2(J, K);
DST(2, 1) = DST(0, 2) = AVG2(K, L);
DST(1, 0) = AVG3(I, J, K);
DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
DST(3, 1) = DST(1, 2) = AVG3(K, L, L);
DST(3, 2) = DST(2, 2) =
DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
}
static void HD4(uint8_t* dst, const uint8_t* top) {
const int X = top[-1];
const int I = top[-2];
const int J = top[-3];
const int K = top[-4];
const int L = top[-5];
const int A = top[0];
const int B = top[1];
const int C = top[2];
DST(0, 0) = DST(2, 1) = AVG2(I, X);
DST(0, 1) = DST(2, 2) = AVG2(J, I);
DST(0, 2) = DST(2, 3) = AVG2(K, J);
DST(0, 3) = AVG2(L, K);
DST(3, 0) = AVG3(A, B, C);
DST(2, 0) = AVG3(X, A, B);
DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
DST(1, 3) = AVG3(L, K, J);
}
static void TM4(uint8_t* dst, const uint8_t* top) {
int x, y;
const uint8_t* const clip = clip1 + 255 - top[-1];
for (y = 0; y < 4; ++y) {
const uint8_t* const clip_table = clip + top[-2 - y];
for (x = 0; x < 4; ++x) {
dst[x] = clip_table[top[x]];
}
dst += BPS;
}
}
#undef DST
#undef AVG3
#undef AVG2
// Left samples are top[-5 .. -2], top_left is top[-1], top samples are
// located at top[0..3], and top right is top[4..7]
static void Intra4Preds_C(uint8_t* dst, const uint8_t* top) {
DC4(I4DC4 + dst, top);
TM4(I4TM4 + dst, top);
VE4(I4VE4 + dst, top);
HE4(I4HE4 + dst, top);
RD4(I4RD4 + dst, top);
VR4(I4VR4 + dst, top);
LD4(I4LD4 + dst, top);
VL4(I4VL4 + dst, top);
HD4(I4HD4 + dst, top);
HU4(I4HU4 + dst, top);
}
//------------------------------------------------------------------------------
// Metric
#if !WEBP_NEON_OMIT_C_CODE
static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b,
int w, int h) {
int count = 0;
int y, x;
for (y = 0; y < h; ++y) {
for (x = 0; x < w; ++x) {
const int diff = (int)a[x] - b[x];
count += diff * diff;
}
a += BPS;
b += BPS;
}
return count;
}
static int SSE16x16_C(const uint8_t* a, const uint8_t* b) {
return GetSSE(a, b, 16, 16);
}
static int SSE16x8_C(const uint8_t* a, const uint8_t* b) {
return GetSSE(a, b, 16, 8);
}
static int SSE8x8_C(const uint8_t* a, const uint8_t* b) {
return GetSSE(a, b, 8, 8);
}
static int SSE4x4_C(const uint8_t* a, const uint8_t* b) {
return GetSSE(a, b, 4, 4);
}
#endif // !WEBP_NEON_OMIT_C_CODE
static void Mean16x4_C(const uint8_t* ref, uint32_t dc[4]) {
int k, x, y;
for (k = 0; k < 4; ++k) {
uint32_t avg = 0;
for (y = 0; y < 4; ++y) {
for (x = 0; x < 4; ++x) {
avg += ref[x + y * BPS];
}
}
dc[k] = avg;
ref += 4; // go to next 4x4 block.
}
}
//------------------------------------------------------------------------------
// Texture distortion
//
// We try to match the spectral content (weighted) between source and
// reconstructed samples.
#if !WEBP_NEON_OMIT_C_CODE
// Hadamard transform
// Returns the weighted sum of the absolute value of transformed coefficients.
// w[] contains a row-major 4 by 4 symmetric matrix.
static int TTransform(const uint8_t* in, const uint16_t* w) {
int sum = 0;
int tmp[16];
int i;
// horizontal pass
for (i = 0; i < 4; ++i, in += BPS) {
const int a0 = in[0] + in[2];
const int a1 = in[1] + in[3];
const int a2 = in[1] - in[3];
const int a3 = in[0] - in[2];
tmp[0 + i * 4] = a0 + a1;
tmp[1 + i * 4] = a3 + a2;
tmp[2 + i * 4] = a3 - a2;
tmp[3 + i * 4] = a0 - a1;
}
// vertical pass
for (i = 0; i < 4; ++i, ++w) {
const int a0 = tmp[0 + i] + tmp[8 + i];
const int a1 = tmp[4 + i] + tmp[12+ i];
const int a2 = tmp[4 + i] - tmp[12+ i];
const int a3 = tmp[0 + i] - tmp[8 + i];
const int b0 = a0 + a1;
const int b1 = a3 + a2;
const int b2 = a3 - a2;
const int b3 = a0 - a1;
sum += w[ 0] * abs(b0);
sum += w[ 4] * abs(b1);
sum += w[ 8] * abs(b2);
sum += w[12] * abs(b3);
}
return sum;
}
static int Disto4x4_C(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
const int sum1 = TTransform(a, w);
const int sum2 = TTransform(b, w);
return abs(sum2 - sum1) >> 5;
}
static int Disto16x16_C(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
int D = 0;
int x, y;
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
for (x = 0; x < 16; x += 4) {
D += Disto4x4_C(a + x + y, b + x + y, w);
}
}
return D;
}
#endif // !WEBP_NEON_OMIT_C_CODE
//------------------------------------------------------------------------------
// Quantization
//
static const uint8_t kZigzag[16] = {
0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};
// Simple quantization
static int QuantizeBlock_C(int16_t in[16], int16_t out[16],
                           const VP8Matrix* const mtx) {
  int last = -1;
  int n;
  for (n = 0; n < 16; ++n) {
    const int j = kZigzag[n];
    const int sign = (in[j] < 0);
    const uint32_t coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
    if (coeff > mtx->zthresh_[j]) {
      const uint32_t Q = mtx->q_[j];
      const uint32_t iQ = mtx->iq_[j];
      const uint32_t B = mtx->bias_[j];
      int level = QUANTDIV(coeff, iQ, B);
      if (level > MAX_LEVEL) level = MAX_LEVEL;
      if (sign) level = -level;
      in[j] = level * (int)Q;
      out[n] = level;
      if (level) last = n;
    } else {
      out[n] = 0;
      in[j] = 0;
    }
  }
  return (last >= 0);
}
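// The division-free quantizer above, spelled out (illustration; QUANTDIV and
// QFIX come from vp8i_enc.h, and treating QFIX as 17 here is an assumption):
//   level = (coeff * iq + bias) >> QFIX   // iq ~ (1 << QFIX) / q
// e.g. coeff = 35, q = 8, iq = 16384: (35 * 16384 + bias) >> 17 gives
// level = 4, and in[j] is rebuilt as 4 * 8 = 32 for the reconstruction path.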
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
static int Quantize2Blocks_C(int16_t in[32], int16_t out[32],
const VP8Matrix* const mtx) {
int nz;
nz = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
return nz;
}
#endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
//------------------------------------------------------------------------------
// Block copy
static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int w, int h) {
int y;
for (y = 0; y < h; ++y) {
memcpy(dst, src, w);
src += BPS;
dst += BPS;
}
}
static void Copy4x4_C(const uint8_t* src, uint8_t* dst) {
Copy(src, dst, 4, 4);
}
static void Copy16x8_C(const uint8_t* src, uint8_t* dst) {
Copy(src, dst, 16, 8);
}
//------------------------------------------------------------------------------
// Initialization
// Speed-critical function pointers. We have to initialize them to the default
// implementations within VP8EncDspInit().
VP8CHisto VP8CollectHistogram;
VP8Idct VP8ITransform;
VP8Fdct VP8FTransform;
VP8Fdct VP8FTransform2;
VP8WHT VP8FTransformWHT;
VP8Intra4Preds VP8EncPredLuma4;
VP8IntraPreds VP8EncPredLuma16;
VP8IntraPreds VP8EncPredChroma8;
VP8Metric VP8SSE16x16;
VP8Metric VP8SSE8x8;
VP8Metric VP8SSE16x8;
VP8Metric VP8SSE4x4;
VP8WMetric VP8TDisto4x4;
VP8WMetric VP8TDisto16x16;
VP8MeanMetric VP8Mean16x4;
VP8QuantizeBlock VP8EncQuantizeBlock;
VP8Quantize2Blocks VP8EncQuantize2Blocks;
VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT;
VP8BlockCopy VP8Copy4x4;
VP8BlockCopy VP8Copy16x8;
extern void VP8EncDspInitSSE2(void);
extern void VP8EncDspInitSSE41(void);
extern void VP8EncDspInitNEON(void);
extern void VP8EncDspInitMIPS32(void);
extern void VP8EncDspInitMIPSdspR2(void);
extern void VP8EncDspInitMSA(void);
WEBP_DSP_INIT_FUNC(VP8EncDspInit) {
VP8DspInit(); // common inverse transforms
InitTables();
// default C implementations
#if !WEBP_NEON_OMIT_C_CODE
VP8ITransform = ITransform_C;
VP8FTransform = FTransform_C;
VP8FTransformWHT = FTransformWHT_C;
VP8TDisto4x4 = Disto4x4_C;
VP8TDisto16x16 = Disto16x16_C;
VP8CollectHistogram = CollectHistogram_C;
VP8SSE16x16 = SSE16x16_C;
VP8SSE16x8 = SSE16x8_C;
VP8SSE8x8 = SSE8x8_C;
VP8SSE4x4 = SSE4x4_C;
#endif
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
VP8EncQuantizeBlock = QuantizeBlock_C;
VP8EncQuantize2Blocks = Quantize2Blocks_C;
#endif
VP8FTransform2 = FTransform2_C;
VP8EncPredLuma4 = Intra4Preds_C;
VP8EncPredLuma16 = Intra16Preds_C;
VP8EncPredChroma8 = IntraChromaPreds_C;
VP8Mean16x4 = Mean16x4_C;
VP8EncQuantizeBlockWHT = QuantizeBlock_C;
VP8Copy4x4 = Copy4x4_C;
VP8Copy16x8 = Copy16x8_C;
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_HAVE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
VP8EncDspInitSSE2();
#if defined(WEBP_HAVE_SSE41)
if (VP8GetCPUInfo(kSSE4_1)) {
VP8EncDspInitSSE41();
}
#endif
}
#endif
#if defined(WEBP_USE_MIPS32)
if (VP8GetCPUInfo(kMIPS32)) {
VP8EncDspInitMIPS32();
}
#endif
#if defined(WEBP_USE_MIPS_DSP_R2)
if (VP8GetCPUInfo(kMIPSdspR2)) {
VP8EncDspInitMIPSdspR2();
}
#endif
#if defined(WEBP_USE_MSA)
if (VP8GetCPUInfo(kMSA)) {
VP8EncDspInitMSA();
}
#endif
}
#if defined(WEBP_HAVE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
VP8EncDspInitNEON();
}
#endif
assert(VP8ITransform != NULL);
assert(VP8FTransform != NULL);
assert(VP8FTransformWHT != NULL);
assert(VP8TDisto4x4 != NULL);
assert(VP8TDisto16x16 != NULL);
assert(VP8CollectHistogram != NULL);
assert(VP8SSE16x16 != NULL);
assert(VP8SSE16x8 != NULL);
assert(VP8SSE8x8 != NULL);
assert(VP8SSE4x4 != NULL);
assert(VP8EncQuantizeBlock != NULL);
assert(VP8EncQuantize2Blocks != NULL);
assert(VP8FTransform2 != NULL);
assert(VP8EncPredLuma4 != NULL);
assert(VP8EncPredLuma16 != NULL);
assert(VP8EncPredChroma8 != NULL);
assert(VP8Mean16x4 != NULL);
assert(VP8EncQuantizeBlockWHT != NULL);
assert(VP8Copy4x4 != NULL);
assert(VP8Copy16x8 != NULL);
}
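A small dispatch sketch (illustration; src, ref and mtx stand in for real
encoder state, and only the init-once behavior implied by WEBP_DSP_INIT_FUNC
is assumed):

  VP8EncDspInit();                   // safe to call more than once
  int16_t coeffs[32], levels[32];
  VP8FTransform2(src, ref, coeffs);  // two 4x4 forward DCTs
  const int nz = VP8EncQuantize2Blocks(coeffs, levels, &mtx);
  // nz has bit k set when block k kept at least one non-zero level.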


@ -0,0 +1,677 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// MIPS version of speed-critical encoding functions.
//
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
// Slobodan Prijic (slobodan.prijic@imgtec.com)
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MIPS32)
#include "src/dsp/mips_macro.h"
#include "src/enc/vp8i_enc.h"
#include "src/enc/cost_enc.h"
static const int kC1 = 20091 + (1 << 16);
static const int kC2 = 35468;
// macro for one vertical pass in ITransformOne
// MUL macro inlined
// temp0..temp15 holds tmp[0]..tmp[15]
// A..D - offsets in bytes to load from in buffer
// TEMP0..TEMP3 - registers for corresponding tmp elements
// TEMP4..TEMP5 - temporary registers
#define VERTICAL_PASS(A, B, C, D, TEMP4, TEMP0, TEMP1, TEMP2, TEMP3) \
"lh %[temp16], " #A "(%[temp20]) \n\t" \
"lh %[temp18], " #B "(%[temp20]) \n\t" \
"lh %[temp17], " #C "(%[temp20]) \n\t" \
"lh %[temp19], " #D "(%[temp20]) \n\t" \
"addu %[" #TEMP4 "], %[temp16], %[temp18] \n\t" \
"subu %[temp16], %[temp16], %[temp18] \n\t" \
"mul %[" #TEMP0 "], %[temp17], %[kC2] \n\t" \
"mul %[temp18], %[temp19], %[kC1] \n\t" \
"mul %[temp17], %[temp17], %[kC1] \n\t" \
"mul %[temp19], %[temp19], %[kC2] \n\t" \
"sra %[" #TEMP0 "], %[" #TEMP0 "], 16 \n\n" \
"sra %[temp18], %[temp18], 16 \n\n" \
"sra %[temp17], %[temp17], 16 \n\n" \
"sra %[temp19], %[temp19], 16 \n\n" \
"subu %[" #TEMP2 "], %[" #TEMP0 "], %[temp18] \n\t" \
"addu %[" #TEMP3 "], %[temp17], %[temp19] \n\t" \
"addu %[" #TEMP0 "], %[" #TEMP4 "], %[" #TEMP3 "] \n\t" \
"addu %[" #TEMP1 "], %[temp16], %[" #TEMP2 "] \n\t" \
"subu %[" #TEMP2 "], %[temp16], %[" #TEMP2 "] \n\t" \
"subu %[" #TEMP3 "], %[" #TEMP4 "], %[" #TEMP3 "] \n\t"
// macro for one horizontal pass in ITransformOne
// MUL and STORE macros inlined
// a = clip_8b(a) is replaced with: a = max(a, 0); a = min(a, 255)
// temp0..temp15 holds tmp[0]..tmp[15]
// A - offset in bytes to load from ref and store to dst buffer
// TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
#define HORIZONTAL_PASS(A, TEMP0, TEMP4, TEMP8, TEMP12) \
"addiu %[" #TEMP0 "], %[" #TEMP0 "], 4 \n\t" \
"addu %[temp16], %[" #TEMP0 "], %[" #TEMP8 "] \n\t" \
"subu %[temp17], %[" #TEMP0 "], %[" #TEMP8 "] \n\t" \
"mul %[" #TEMP0 "], %[" #TEMP4 "], %[kC2] \n\t" \
"mul %[" #TEMP8 "], %[" #TEMP12 "], %[kC1] \n\t" \
"mul %[" #TEMP4 "], %[" #TEMP4 "], %[kC1] \n\t" \
"mul %[" #TEMP12 "], %[" #TEMP12 "], %[kC2] \n\t" \
"sra %[" #TEMP0 "], %[" #TEMP0 "], 16 \n\t" \
"sra %[" #TEMP8 "], %[" #TEMP8 "], 16 \n\t" \
"sra %[" #TEMP4 "], %[" #TEMP4 "], 16 \n\t" \
"sra %[" #TEMP12 "], %[" #TEMP12 "], 16 \n\t" \
"subu %[temp18], %[" #TEMP0 "], %[" #TEMP8 "] \n\t" \
"addu %[temp19], %[" #TEMP4 "], %[" #TEMP12 "] \n\t" \
"addu %[" #TEMP0 "], %[temp16], %[temp19] \n\t" \
"addu %[" #TEMP4 "], %[temp17], %[temp18] \n\t" \
"subu %[" #TEMP8 "], %[temp17], %[temp18] \n\t" \
"subu %[" #TEMP12 "], %[temp16], %[temp19] \n\t" \
"lw %[temp20], 0(%[args]) \n\t" \
"sra %[" #TEMP0 "], %[" #TEMP0 "], 3 \n\t" \
"sra %[" #TEMP4 "], %[" #TEMP4 "], 3 \n\t" \
"sra %[" #TEMP8 "], %[" #TEMP8 "], 3 \n\t" \
"sra %[" #TEMP12 "], %[" #TEMP12 "], 3 \n\t" \
"lbu %[temp16], 0+" XSTR(BPS) "*" #A "(%[temp20]) \n\t" \
"lbu %[temp17], 1+" XSTR(BPS) "*" #A "(%[temp20]) \n\t" \
"lbu %[temp18], 2+" XSTR(BPS) "*" #A "(%[temp20]) \n\t" \
"lbu %[temp19], 3+" XSTR(BPS) "*" #A "(%[temp20]) \n\t" \
"addu %[" #TEMP0 "], %[temp16], %[" #TEMP0 "] \n\t" \
"addu %[" #TEMP4 "], %[temp17], %[" #TEMP4 "] \n\t" \
"addu %[" #TEMP8 "], %[temp18], %[" #TEMP8 "] \n\t" \
"addu %[" #TEMP12 "], %[temp19], %[" #TEMP12 "] \n\t" \
"slt %[temp16], %[" #TEMP0 "], $zero \n\t" \
"slt %[temp17], %[" #TEMP4 "], $zero \n\t" \
"slt %[temp18], %[" #TEMP8 "], $zero \n\t" \
"slt %[temp19], %[" #TEMP12 "], $zero \n\t" \
"movn %[" #TEMP0 "], $zero, %[temp16] \n\t" \
"movn %[" #TEMP4 "], $zero, %[temp17] \n\t" \
"movn %[" #TEMP8 "], $zero, %[temp18] \n\t" \
"movn %[" #TEMP12 "], $zero, %[temp19] \n\t" \
"addiu %[temp20], $zero, 255 \n\t" \
"slt %[temp16], %[" #TEMP0 "], %[temp20] \n\t" \
"slt %[temp17], %[" #TEMP4 "], %[temp20] \n\t" \
"slt %[temp18], %[" #TEMP8 "], %[temp20] \n\t" \
"slt %[temp19], %[" #TEMP12 "], %[temp20] \n\t" \
"movz %[" #TEMP0 "], %[temp20], %[temp16] \n\t" \
"movz %[" #TEMP4 "], %[temp20], %[temp17] \n\t" \
"lw %[temp16], 8(%[args]) \n\t" \
"movz %[" #TEMP8 "], %[temp20], %[temp18] \n\t" \
"movz %[" #TEMP12 "], %[temp20], %[temp19] \n\t" \
"sb %[" #TEMP0 "], 0+" XSTR(BPS) "*" #A "(%[temp16]) \n\t" \
"sb %[" #TEMP4 "], 1+" XSTR(BPS) "*" #A "(%[temp16]) \n\t" \
"sb %[" #TEMP8 "], 2+" XSTR(BPS) "*" #A "(%[temp16]) \n\t" \
"sb %[" #TEMP12 "], 3+" XSTR(BPS) "*" #A "(%[temp16]) \n\t"
// Does one inverse transform; the ITransform_MIPS32 wrapper below runs it once or twice.
static WEBP_INLINE void ITransformOne_MIPS32(const uint8_t* ref,
const int16_t* in,
uint8_t* dst) {
int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
int temp7, temp8, temp9, temp10, temp11, temp12, temp13;
int temp14, temp15, temp16, temp17, temp18, temp19, temp20;
const int* args[3] = {(const int*)ref, (const int*)in, (const int*)dst};
__asm__ volatile(
"lw %[temp20], 4(%[args]) \n\t"
VERTICAL_PASS(0, 16, 8, 24, temp4, temp0, temp1, temp2, temp3)
VERTICAL_PASS(2, 18, 10, 26, temp8, temp4, temp5, temp6, temp7)
VERTICAL_PASS(4, 20, 12, 28, temp12, temp8, temp9, temp10, temp11)
VERTICAL_PASS(6, 22, 14, 30, temp20, temp12, temp13, temp14, temp15)
HORIZONTAL_PASS(0, temp0, temp4, temp8, temp12)
HORIZONTAL_PASS(1, temp1, temp5, temp9, temp13)
HORIZONTAL_PASS(2, temp2, temp6, temp10, temp14)
HORIZONTAL_PASS(3, temp3, temp7, temp11, temp15)
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
[temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11),
[temp12]"=&r"(temp12), [temp13]"=&r"(temp13), [temp14]"=&r"(temp14),
[temp15]"=&r"(temp15), [temp16]"=&r"(temp16), [temp17]"=&r"(temp17),
[temp18]"=&r"(temp18), [temp19]"=&r"(temp19), [temp20]"=&r"(temp20)
: [args]"r"(args), [kC1]"r"(kC1), [kC2]"r"(kC2)
: "memory", "hi", "lo"
);
}
static void ITransform_MIPS32(const uint8_t* ref, const int16_t* in,
uint8_t* dst, int do_two) {
ITransformOne_MIPS32(ref, in, dst);
if (do_two) {
ITransformOne_MIPS32(ref + 4, in + 16, dst + 4);
}
}
#undef VERTICAL_PASS
#undef HORIZONTAL_PASS
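// For reference, a scalar sketch of the arithmetic the two macros above
// implement (mirroring the generic C inverse transform; MUL1/MUL2 are the
// same fixed-point products as the kC1/kC2 'mul' + 'sra 16' pairs -- the
// '+ (1 << 16)' baked into kC1 supplies the extra '+ a' term; clip_8b()
// is assumed to clamp to [0, 255]):
#if 0   // reference only, not compiled
#define MUL1(a) ((((a) * 20091) >> 16) + (a))   // i.e. (a * kC1) >> 16
#define MUL2(a) (((a) * 35468) >> 16)           // i.e. (a * kC2) >> 16
static void ITransformOne_RefC(const uint8_t* ref, const int16_t* in,
                               uint8_t* dst) {
  int tmp[16];
  int i;
  int* t = tmp;
  for (i = 0; i < 4; ++i, ++in, t += 4) {             // vertical pass
    const int a = in[0] + in[8];
    const int b = in[0] - in[8];
    const int c = MUL2(in[4]) - MUL1(in[12]);
    const int d = MUL1(in[4]) + MUL2(in[12]);
    t[0] = a + d;
    t[1] = b + c;
    t[2] = b - c;
    t[3] = a - d;
  }
  for (i = 0; i < 4; ++i, ref += BPS, dst += BPS) {   // horizontal pass
    const int dc = tmp[0 + i] + 4;                    // '+ 4' = rounding bias
    const int a = dc + tmp[8 + i];
    const int b = dc - tmp[8 + i];
    const int c = MUL2(tmp[4 + i]) - MUL1(tmp[12 + i]);
    const int d = MUL1(tmp[4 + i]) + MUL2(tmp[12 + i]);
    dst[0] = clip_8b(ref[0] + ((a + d) >> 3));
    dst[1] = clip_8b(ref[1] + ((b + c) >> 3));
    dst[2] = clip_8b(ref[2] + ((b - c) >> 3));
    dst[3] = clip_8b(ref[3] + ((a - d) >> 3));
  }
}
#undef MUL1
#undef MUL2
#endif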
// macro for one pass through the for-loop in QuantizeBlock
// QUANTDIV macro inlined
// J - offset in bytes (kZigzag[n] * 2)
// K - offset in bytes (kZigzag[n] * 4)
// N - offset in bytes (n * 2)
#define QUANTIZE_ONE(J, K, N) \
"lh %[temp0], " #J "(%[ppin]) \n\t" \
"lhu %[temp1], " #J "(%[ppsharpen]) \n\t" \
"lw %[temp2], " #K "(%[ppzthresh]) \n\t" \
"sra %[sign], %[temp0], 15 \n\t" \
"xor %[coeff], %[temp0], %[sign] \n\t" \
"subu %[coeff], %[coeff], %[sign] \n\t" \
"addu %[coeff], %[coeff], %[temp1] \n\t" \
"slt %[temp4], %[temp2], %[coeff] \n\t" \
"addiu %[temp5], $zero, 0 \n\t" \
"addiu %[level], $zero, 0 \n\t" \
"beqz %[temp4], 2f \n\t" \
"lhu %[temp1], " #J "(%[ppiq]) \n\t" \
"lw %[temp2], " #K "(%[ppbias]) \n\t" \
"lhu %[temp3], " #J "(%[ppq]) \n\t" \
"mul %[level], %[coeff], %[temp1] \n\t" \
"addu %[level], %[level], %[temp2] \n\t" \
"sra %[level], %[level], 17 \n\t" \
"slt %[temp4], %[max_level], %[level] \n\t" \
"movn %[level], %[max_level], %[temp4] \n\t" \
"xor %[level], %[level], %[sign] \n\t" \
"subu %[level], %[level], %[sign] \n\t" \
"mul %[temp5], %[level], %[temp3] \n\t" \
"2: \n\t" \
"sh %[temp5], " #J "(%[ppin]) \n\t" \
"sh %[level], " #N "(%[pout]) \n\t"
static int QuantizeBlock_MIPS32(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
int temp0, temp1, temp2, temp3, temp4, temp5;
int sign, coeff, level, i;
int max_level = MAX_LEVEL;
int16_t* ppin = &in[0];
int16_t* pout = &out[0];
const uint16_t* ppsharpen = &mtx->sharpen_[0];
const uint32_t* ppzthresh = &mtx->zthresh_[0];
const uint16_t* ppq = &mtx->q_[0];
const uint16_t* ppiq = &mtx->iq_[0];
const uint32_t* ppbias = &mtx->bias_[0];
__asm__ volatile(
QUANTIZE_ONE( 0, 0, 0)
QUANTIZE_ONE( 2, 4, 2)
QUANTIZE_ONE( 8, 16, 4)
QUANTIZE_ONE(16, 32, 6)
QUANTIZE_ONE(10, 20, 8)
QUANTIZE_ONE( 4, 8, 10)
QUANTIZE_ONE( 6, 12, 12)
QUANTIZE_ONE(12, 24, 14)
QUANTIZE_ONE(18, 36, 16)
QUANTIZE_ONE(24, 48, 18)
QUANTIZE_ONE(26, 52, 20)
QUANTIZE_ONE(20, 40, 22)
QUANTIZE_ONE(14, 28, 24)
QUANTIZE_ONE(22, 44, 26)
QUANTIZE_ONE(28, 56, 28)
QUANTIZE_ONE(30, 60, 30)
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
[temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
[temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[sign]"=&r"(sign), [coeff]"=&r"(coeff),
[level]"=&r"(level)
: [pout]"r"(pout), [ppin]"r"(ppin),
[ppiq]"r"(ppiq), [max_level]"r"(max_level),
[ppbias]"r"(ppbias), [ppzthresh]"r"(ppzthresh),
[ppsharpen]"r"(ppsharpen), [ppq]"r"(ppq)
: "memory", "hi", "lo"
);
// Kept outside of the asm macro so the scan can break as early as possible.
for (i = 15; i >= 0; i--) {
if (out[i]) return 1;
}
return 0;
}
static int Quantize2Blocks_MIPS32(int16_t in[32], int16_t out[32],
const VP8Matrix* const mtx) {
int nz;
nz = QuantizeBlock_MIPS32(in + 0 * 16, out + 0 * 16, mtx) << 0;
nz |= QuantizeBlock_MIPS32(in + 1 * 16, out + 1 * 16, mtx) << 1;
return nz;
}
#undef QUANTIZE_ONE
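// For reference, a scalar sketch of what QUANTIZE_ONE does per coefficient
// (mirroring the generic C quantizer; QFIX == 17 matches the 'sra ..., 17'
// above, and kZigzag[] is the scan order hard-coded in the J/K/N offsets):
#if 0   // reference only, not compiled
static const uint8_t kZigzag[16] = {
  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};
static int QuantizeBlock_RefC(int16_t in[16], int16_t out[16],
                              const VP8Matrix* const mtx) {
  int last = -1;
  int n;
  for (n = 0; n < 16; ++n) {
    const int j = kZigzag[n];
    const int sign = (in[j] < 0);
    const uint32_t coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
    if (coeff > mtx->zthresh_[j]) {
      const uint32_t iQ = mtx->iq_[j];
      const uint32_t B = mtx->bias_[j];
      int level = (int)((coeff * iQ + B) >> 17);   // QUANTDIV(coeff, iQ, B)
      if (level > MAX_LEVEL) level = MAX_LEVEL;
      if (sign) level = -level;
      in[j] = level * (int)mtx->q_[j];   // reconstructed value, written back
      out[n] = level;
      if (level) last = n;
    } else {
      out[n] = 0;
      in[j] = 0;
    }
  }
  return (last >= 0);
}
#endif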
// macro for one horizontal pass in Disto4x4 (TTransform)
// the two TTransform calls are merged into a single pass
// A - offset in bytes to load from a and b buffers
// E..H - offsets in bytes to store first results to tmp buffer
// E1..H1 - offsets in bytes to store second results to tmp buffer
#define HORIZONTAL_PASS(A, E, F, G, H, E1, F1, G1, H1) \
"lbu %[temp0], 0+" XSTR(BPS) "*" #A "(%[a]) \n\t" \
"lbu %[temp1], 1+" XSTR(BPS) "*" #A "(%[a]) \n\t" \
"lbu %[temp2], 2+" XSTR(BPS) "*" #A "(%[a]) \n\t" \
"lbu %[temp3], 3+" XSTR(BPS) "*" #A "(%[a]) \n\t" \
"lbu %[temp4], 0+" XSTR(BPS) "*" #A "(%[b]) \n\t" \
"lbu %[temp5], 1+" XSTR(BPS) "*" #A "(%[b]) \n\t" \
"lbu %[temp6], 2+" XSTR(BPS) "*" #A "(%[b]) \n\t" \
"lbu %[temp7], 3+" XSTR(BPS) "*" #A "(%[b]) \n\t" \
"addu %[temp8], %[temp0], %[temp2] \n\t" \
"subu %[temp0], %[temp0], %[temp2] \n\t" \
"addu %[temp2], %[temp1], %[temp3] \n\t" \
"subu %[temp1], %[temp1], %[temp3] \n\t" \
"addu %[temp3], %[temp4], %[temp6] \n\t" \
"subu %[temp4], %[temp4], %[temp6] \n\t" \
"addu %[temp6], %[temp5], %[temp7] \n\t" \
"subu %[temp5], %[temp5], %[temp7] \n\t" \
"addu %[temp7], %[temp8], %[temp2] \n\t" \
"subu %[temp2], %[temp8], %[temp2] \n\t" \
"addu %[temp8], %[temp0], %[temp1] \n\t" \
"subu %[temp0], %[temp0], %[temp1] \n\t" \
"addu %[temp1], %[temp3], %[temp6] \n\t" \
"subu %[temp3], %[temp3], %[temp6] \n\t" \
"addu %[temp6], %[temp4], %[temp5] \n\t" \
"subu %[temp4], %[temp4], %[temp5] \n\t" \
"sw %[temp7], " #E "(%[tmp]) \n\t" \
"sw %[temp2], " #H "(%[tmp]) \n\t" \
"sw %[temp8], " #F "(%[tmp]) \n\t" \
"sw %[temp0], " #G "(%[tmp]) \n\t" \
"sw %[temp1], " #E1 "(%[tmp]) \n\t" \
"sw %[temp3], " #H1 "(%[tmp]) \n\t" \
"sw %[temp6], " #F1 "(%[tmp]) \n\t" \
"sw %[temp4], " #G1 "(%[tmp]) \n\t"
// macro for one vertical pass in Disto4x4 (TTransform)
// the two TTransform calls are merged into a single pass; since only one
// accumulator (HI/LO) is available in the MIPS32r1 instruction set, the
// second TTransform call is evaluated first, followed by the first one:
//   const int sum1 = TTransform(a, w);
//   const int sum2 = TTransform(b, w);
//   return abs(sum2 - sum1) >> 5;
// (sum2 - sum1) is accumulated with madd (for sum2) and msub (for sum1)
// A..D - offsets in bytes to load first results from tmp buffer
// A1..D1 - offsets in bytes to load second results from tmp buffer
// E..H - offsets in bytes to load from w buffer
#define VERTICAL_PASS(A, B, C, D, A1, B1, C1, D1, E, F, G, H) \
"lw %[temp0], " #A1 "(%[tmp]) \n\t" \
"lw %[temp1], " #C1 "(%[tmp]) \n\t" \
"lw %[temp2], " #B1 "(%[tmp]) \n\t" \
"lw %[temp3], " #D1 "(%[tmp]) \n\t" \
"addu %[temp8], %[temp0], %[temp1] \n\t" \
"subu %[temp0], %[temp0], %[temp1] \n\t" \
"addu %[temp1], %[temp2], %[temp3] \n\t" \
"subu %[temp2], %[temp2], %[temp3] \n\t" \
"addu %[temp3], %[temp8], %[temp1] \n\t" \
"subu %[temp8], %[temp8], %[temp1] \n\t" \
"addu %[temp1], %[temp0], %[temp2] \n\t" \
"subu %[temp0], %[temp0], %[temp2] \n\t" \
"sra %[temp4], %[temp3], 31 \n\t" \
"sra %[temp5], %[temp1], 31 \n\t" \
"sra %[temp6], %[temp0], 31 \n\t" \
"sra %[temp7], %[temp8], 31 \n\t" \
"xor %[temp3], %[temp3], %[temp4] \n\t" \
"xor %[temp1], %[temp1], %[temp5] \n\t" \
"xor %[temp0], %[temp0], %[temp6] \n\t" \
"xor %[temp8], %[temp8], %[temp7] \n\t" \
"subu %[temp3], %[temp3], %[temp4] \n\t" \
"subu %[temp1], %[temp1], %[temp5] \n\t" \
"subu %[temp0], %[temp0], %[temp6] \n\t" \
"subu %[temp8], %[temp8], %[temp7] \n\t" \
"lhu %[temp4], " #E "(%[w]) \n\t" \
"lhu %[temp5], " #F "(%[w]) \n\t" \
"lhu %[temp6], " #G "(%[w]) \n\t" \
"lhu %[temp7], " #H "(%[w]) \n\t" \
"madd %[temp4], %[temp3] \n\t" \
"madd %[temp5], %[temp1] \n\t" \
"madd %[temp6], %[temp0] \n\t" \
"madd %[temp7], %[temp8] \n\t" \
"lw %[temp0], " #A "(%[tmp]) \n\t" \
"lw %[temp1], " #C "(%[tmp]) \n\t" \
"lw %[temp2], " #B "(%[tmp]) \n\t" \
"lw %[temp3], " #D "(%[tmp]) \n\t" \
"addu %[temp8], %[temp0], %[temp1] \n\t" \
"subu %[temp0], %[temp0], %[temp1] \n\t" \
"addu %[temp1], %[temp2], %[temp3] \n\t" \
"subu %[temp2], %[temp2], %[temp3] \n\t" \
"addu %[temp3], %[temp8], %[temp1] \n\t" \
"subu %[temp1], %[temp8], %[temp1] \n\t" \
"addu %[temp8], %[temp0], %[temp2] \n\t" \
"subu %[temp0], %[temp0], %[temp2] \n\t" \
"sra %[temp2], %[temp3], 31 \n\t" \
"xor %[temp3], %[temp3], %[temp2] \n\t" \
"subu %[temp3], %[temp3], %[temp2] \n\t" \
"msub %[temp4], %[temp3] \n\t" \
"sra %[temp2], %[temp8], 31 \n\t" \
"sra %[temp3], %[temp0], 31 \n\t" \
"sra %[temp4], %[temp1], 31 \n\t" \
"xor %[temp8], %[temp8], %[temp2] \n\t" \
"xor %[temp0], %[temp0], %[temp3] \n\t" \
"xor %[temp1], %[temp1], %[temp4] \n\t" \
"subu %[temp8], %[temp8], %[temp2] \n\t" \
"subu %[temp0], %[temp0], %[temp3] \n\t" \
"subu %[temp1], %[temp1], %[temp4] \n\t" \
"msub %[temp5], %[temp8] \n\t" \
"msub %[temp6], %[temp0] \n\t" \
"msub %[temp7], %[temp1] \n\t"
static int Disto4x4_MIPS32(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
int tmp[32];
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
__asm__ volatile(
HORIZONTAL_PASS(0, 0, 4, 8, 12, 64, 68, 72, 76)
HORIZONTAL_PASS(1, 16, 20, 24, 28, 80, 84, 88, 92)
HORIZONTAL_PASS(2, 32, 36, 40, 44, 96, 100, 104, 108)
HORIZONTAL_PASS(3, 48, 52, 56, 60, 112, 116, 120, 124)
"mthi $zero \n\t"
"mtlo $zero \n\t"
VERTICAL_PASS( 0, 16, 32, 48, 64, 80, 96, 112, 0, 8, 16, 24)
VERTICAL_PASS( 4, 20, 36, 52, 68, 84, 100, 116, 2, 10, 18, 26)
VERTICAL_PASS( 8, 24, 40, 56, 72, 88, 104, 120, 4, 12, 20, 28)
VERTICAL_PASS(12, 28, 44, 60, 76, 92, 108, 124, 6, 14, 22, 30)
"mflo %[temp0] \n\t"
"sra %[temp1], %[temp0], 31 \n\t"
"xor %[temp0], %[temp0], %[temp1] \n\t"
"subu %[temp0], %[temp0], %[temp1] \n\t"
"sra %[temp0], %[temp0], 5 \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
: [a]"r"(a), [b]"r"(b), [w]"r"(w), [tmp]"r"(tmp)
: "memory", "hi", "lo"
);
return temp0;
}
#undef VERTICAL_PASS
#undef HORIZONTAL_PASS
static int Disto16x16_MIPS32(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
int D = 0;
int x, y;
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
for (x = 0; x < 16; x += 4) {
D += Disto4x4_MIPS32(a + x + y, b + x + y, w);
}
}
return D;
}
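// For reference, a scalar sketch of the weighted Hadamard-style transform
// that the fused asm above computes twice (once for 'a', once for 'b'):
#if 0   // reference only, not compiled
static int TTransform_RefC(const uint8_t* in, const uint16_t* w) {
  int sum = 0;
  int tmp[16];
  int i;
  for (i = 0; i < 4; ++i, in += BPS) {   // horizontal pass
    const int a0 = in[0] + in[2];
    const int a1 = in[1] + in[3];
    const int a2 = in[1] - in[3];
    const int a3 = in[0] - in[2];
    tmp[0 + i * 4] = a0 + a1;
    tmp[1 + i * 4] = a3 + a2;
    tmp[2 + i * 4] = a3 - a2;
    tmp[3 + i * 4] = a0 - a1;
  }
  for (i = 0; i < 4; ++i, ++w) {         // vertical pass
    const int a0 = tmp[0 + i] + tmp[8 + i];
    const int a1 = tmp[4 + i] + tmp[12 + i];
    const int a2 = tmp[4 + i] - tmp[12 + i];
    const int a3 = tmp[0 + i] - tmp[8 + i];
    const int b0 = a0 + a1;
    const int b1 = a3 + a2;
    const int b2 = a3 - a2;
    const int b3 = a0 - a1;
    sum += w[0] * abs(b0);
    sum += w[4] * abs(b1);
    sum += w[8] * abs(b2);
    sum += w[12] * abs(b3);
  }
  return sum;   // Disto4x4 == abs(sum2 - sum1) >> 5
}
#endif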
// macro for one horizontal pass in FTransform
// temp0..temp15 holds tmp[0]..tmp[15]
// A - offset in bytes to load from src and ref buffers
// TEMP0..TEMP3 - registers for corresponding tmp elements
#define HORIZONTAL_PASS(A, TEMP0, TEMP1, TEMP2, TEMP3) \
"lw %[" #TEMP1 "], 0(%[args]) \n\t" \
"lw %[" #TEMP2 "], 4(%[args]) \n\t" \
"lbu %[temp16], 0+" XSTR(BPS) "*" #A "(%[" #TEMP1 "]) \n\t" \
"lbu %[temp17], 0+" XSTR(BPS) "*" #A "(%[" #TEMP2 "]) \n\t" \
"lbu %[temp18], 1+" XSTR(BPS) "*" #A "(%[" #TEMP1 "]) \n\t" \
"lbu %[temp19], 1+" XSTR(BPS) "*" #A "(%[" #TEMP2 "]) \n\t" \
"subu %[temp20], %[temp16], %[temp17] \n\t" \
"lbu %[temp16], 2+" XSTR(BPS) "*" #A "(%[" #TEMP1 "]) \n\t" \
"lbu %[temp17], 2+" XSTR(BPS) "*" #A "(%[" #TEMP2 "]) \n\t" \
"subu %[" #TEMP0 "], %[temp18], %[temp19] \n\t" \
"lbu %[temp18], 3+" XSTR(BPS) "*" #A "(%[" #TEMP1 "]) \n\t" \
"lbu %[temp19], 3+" XSTR(BPS) "*" #A "(%[" #TEMP2 "]) \n\t" \
"subu %[" #TEMP1 "], %[temp16], %[temp17] \n\t" \
"subu %[" #TEMP2 "], %[temp18], %[temp19] \n\t" \
"addu %[" #TEMP3 "], %[temp20], %[" #TEMP2 "] \n\t" \
"subu %[" #TEMP2 "], %[temp20], %[" #TEMP2 "] \n\t" \
"addu %[temp20], %[" #TEMP0 "], %[" #TEMP1 "] \n\t" \
"subu %[" #TEMP0 "], %[" #TEMP0 "], %[" #TEMP1 "] \n\t" \
"mul %[temp16], %[" #TEMP2 "], %[c5352] \n\t" \
"mul %[temp17], %[" #TEMP2 "], %[c2217] \n\t" \
"mul %[temp18], %[" #TEMP0 "], %[c5352] \n\t" \
"mul %[temp19], %[" #TEMP0 "], %[c2217] \n\t" \
"addu %[" #TEMP1 "], %[" #TEMP3 "], %[temp20] \n\t" \
"subu %[temp20], %[" #TEMP3 "], %[temp20] \n\t" \
"sll %[" #TEMP0 "], %[" #TEMP1 "], 3 \n\t" \
"sll %[" #TEMP2 "], %[temp20], 3 \n\t" \
"addiu %[temp16], %[temp16], 1812 \n\t" \
"addiu %[temp17], %[temp17], 937 \n\t" \
"addu %[temp16], %[temp16], %[temp19] \n\t" \
"subu %[temp17], %[temp17], %[temp18] \n\t" \
"sra %[" #TEMP1 "], %[temp16], 9 \n\t" \
"sra %[" #TEMP3 "], %[temp17], 9 \n\t"
// macro for one vertical pass in FTransform
// temp0..temp15 holds tmp[0]..tmp[15]
// A..D - offsets in bytes to store to out buffer
// TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
#define VERTICAL_PASS(A, B, C, D, TEMP0, TEMP4, TEMP8, TEMP12) \
"addu %[temp16], %[" #TEMP0 "], %[" #TEMP12 "] \n\t" \
"subu %[temp19], %[" #TEMP0 "], %[" #TEMP12 "] \n\t" \
"addu %[temp17], %[" #TEMP4 "], %[" #TEMP8 "] \n\t" \
"subu %[temp18], %[" #TEMP4 "], %[" #TEMP8 "] \n\t" \
"mul %[" #TEMP8 "], %[temp19], %[c2217] \n\t" \
"mul %[" #TEMP12 "], %[temp18], %[c2217] \n\t" \
"mul %[" #TEMP4 "], %[temp19], %[c5352] \n\t" \
"mul %[temp18], %[temp18], %[c5352] \n\t" \
"addiu %[temp16], %[temp16], 7 \n\t" \
"addu %[" #TEMP0 "], %[temp16], %[temp17] \n\t" \
"sra %[" #TEMP0 "], %[" #TEMP0 "], 4 \n\t" \
"addu %[" #TEMP12 "], %[" #TEMP12 "], %[" #TEMP4 "] \n\t" \
"subu %[" #TEMP4 "], %[temp16], %[temp17] \n\t" \
"sra %[" #TEMP4 "], %[" #TEMP4 "], 4 \n\t" \
"addiu %[" #TEMP8 "], %[" #TEMP8 "], 30000 \n\t" \
"addiu %[" #TEMP12 "], %[" #TEMP12 "], 12000 \n\t" \
"addiu %[" #TEMP8 "], %[" #TEMP8 "], 21000 \n\t" \
"subu %[" #TEMP8 "], %[" #TEMP8 "], %[temp18] \n\t" \
"sra %[" #TEMP12 "], %[" #TEMP12 "], 16 \n\t" \
"sra %[" #TEMP8 "], %[" #TEMP8 "], 16 \n\t" \
"addiu %[temp16], %[" #TEMP12 "], 1 \n\t" \
"movn %[" #TEMP12 "], %[temp16], %[temp19] \n\t" \
"sh %[" #TEMP0 "], " #A "(%[temp20]) \n\t" \
"sh %[" #TEMP4 "], " #C "(%[temp20]) \n\t" \
"sh %[" #TEMP8 "], " #D "(%[temp20]) \n\t" \
"sh %[" #TEMP12 "], " #B "(%[temp20]) \n\t"
static void FTransform_MIPS32(const uint8_t* src, const uint8_t* ref,
int16_t* out) {
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
int temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16;
int temp17, temp18, temp19, temp20;
const int c2217 = 2217;
const int c5352 = 5352;
const int* const args[3] =
{ (const int*)src, (const int*)ref, (const int*)out };
__asm__ volatile(
HORIZONTAL_PASS(0, temp0, temp1, temp2, temp3)
HORIZONTAL_PASS(1, temp4, temp5, temp6, temp7)
HORIZONTAL_PASS(2, temp8, temp9, temp10, temp11)
HORIZONTAL_PASS(3, temp12, temp13, temp14, temp15)
"lw %[temp20], 8(%[args]) \n\t"
VERTICAL_PASS(0, 8, 16, 24, temp0, temp4, temp8, temp12)
VERTICAL_PASS(2, 10, 18, 26, temp1, temp5, temp9, temp13)
VERTICAL_PASS(4, 12, 20, 28, temp2, temp6, temp10, temp14)
VERTICAL_PASS(6, 14, 22, 30, temp3, temp7, temp11, temp15)
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
[temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11),
[temp12]"=&r"(temp12), [temp13]"=&r"(temp13), [temp14]"=&r"(temp14),
[temp15]"=&r"(temp15), [temp16]"=&r"(temp16), [temp17]"=&r"(temp17),
[temp18]"=&r"(temp18), [temp19]"=&r"(temp19), [temp20]"=&r"(temp20)
: [args]"r"(args), [c2217]"r"(c2217), [c5352]"r"(c5352)
: "memory", "hi", "lo"
);
}
#undef VERTICAL_PASS
#undef HORIZONTAL_PASS
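// For reference, a scalar sketch of the forward transform above (mirroring
// the generic C version). Note that the asm splits the 51000 rounding
// constant into 30000 + 21000 because an 'addiu' immediate is limited to
// 16 bits:
#if 0   // reference only, not compiled
static void FTransform_RefC(const uint8_t* src, const uint8_t* ref,
                            int16_t* out) {
  int i;
  int tmp[16];
  for (i = 0; i < 4; ++i, src += BPS, ref += BPS) {   // horizontal pass
    const int d0 = src[0] - ref[0];
    const int d1 = src[1] - ref[1];
    const int d2 = src[2] - ref[2];
    const int d3 = src[3] - ref[3];
    const int a0 = d0 + d3;
    const int a1 = d1 + d2;
    const int a2 = d1 - d2;
    const int a3 = d0 - d3;
    tmp[0 + i * 4] = (a0 + a1) * 8;
    tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 1812) >> 9;
    tmp[2 + i * 4] = (a0 - a1) * 8;
    tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 + 937) >> 9;
  }
  for (i = 0; i < 4; ++i) {                           // vertical pass
    const int a0 = tmp[0 + i] + tmp[12 + i];
    const int a1 = tmp[4 + i] + tmp[8 + i];
    const int a2 = tmp[4 + i] - tmp[8 + i];
    const int a3 = tmp[0 + i] - tmp[12 + i];
    out[0 + i] = (a0 + a1 + 7) >> 4;
    out[4 + i] = ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0);
    out[8 + i] = (a0 - a1 + 7) >> 4;
    out[12 + i] = (a3 * 2217 - a2 * 5352 + 51000) >> 16;
  }
}
#endif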
#if !defined(WORK_AROUND_GCC)
#define GET_SSE_INNER(A, B, C, D) \
"lbu %[temp0], " #A "(%[a]) \n\t" \
"lbu %[temp1], " #A "(%[b]) \n\t" \
"lbu %[temp2], " #B "(%[a]) \n\t" \
"lbu %[temp3], " #B "(%[b]) \n\t" \
"lbu %[temp4], " #C "(%[a]) \n\t" \
"lbu %[temp5], " #C "(%[b]) \n\t" \
"lbu %[temp6], " #D "(%[a]) \n\t" \
"lbu %[temp7], " #D "(%[b]) \n\t" \
"subu %[temp0], %[temp0], %[temp1] \n\t" \
"subu %[temp2], %[temp2], %[temp3] \n\t" \
"subu %[temp4], %[temp4], %[temp5] \n\t" \
"subu %[temp6], %[temp6], %[temp7] \n\t" \
"madd %[temp0], %[temp0] \n\t" \
"madd %[temp2], %[temp2] \n\t" \
"madd %[temp4], %[temp4] \n\t" \
"madd %[temp6], %[temp6] \n\t"
#define GET_SSE(A, B, C, D) \
GET_SSE_INNER(A, A + 1, A + 2, A + 3) \
GET_SSE_INNER(B, B + 1, B + 2, B + 3) \
GET_SSE_INNER(C, C + 1, C + 2, C + 3) \
GET_SSE_INNER(D, D + 1, D + 2, D + 3)
static int SSE16x16_MIPS32(const uint8_t* a, const uint8_t* b) {
int count;
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
__asm__ volatile(
"mult $zero, $zero \n\t"
GET_SSE( 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS)
GET_SSE( 1 * BPS, 4 + 1 * BPS, 8 + 1 * BPS, 12 + 1 * BPS)
GET_SSE( 2 * BPS, 4 + 2 * BPS, 8 + 2 * BPS, 12 + 2 * BPS)
GET_SSE( 3 * BPS, 4 + 3 * BPS, 8 + 3 * BPS, 12 + 3 * BPS)
GET_SSE( 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS)
GET_SSE( 5 * BPS, 4 + 5 * BPS, 8 + 5 * BPS, 12 + 5 * BPS)
GET_SSE( 6 * BPS, 4 + 6 * BPS, 8 + 6 * BPS, 12 + 6 * BPS)
GET_SSE( 7 * BPS, 4 + 7 * BPS, 8 + 7 * BPS, 12 + 7 * BPS)
GET_SSE( 8 * BPS, 4 + 8 * BPS, 8 + 8 * BPS, 12 + 8 * BPS)
GET_SSE( 9 * BPS, 4 + 9 * BPS, 8 + 9 * BPS, 12 + 9 * BPS)
GET_SSE(10 * BPS, 4 + 10 * BPS, 8 + 10 * BPS, 12 + 10 * BPS)
GET_SSE(11 * BPS, 4 + 11 * BPS, 8 + 11 * BPS, 12 + 11 * BPS)
GET_SSE(12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS)
GET_SSE(13 * BPS, 4 + 13 * BPS, 8 + 13 * BPS, 12 + 13 * BPS)
GET_SSE(14 * BPS, 4 + 14 * BPS, 8 + 14 * BPS, 12 + 14 * BPS)
GET_SSE(15 * BPS, 4 + 15 * BPS, 8 + 15 * BPS, 12 + 15 * BPS)
"mflo %[count] \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
: [a]"r"(a), [b]"r"(b)
: "memory", "hi", "lo"
);
return count;
}
static int SSE16x8_MIPS32(const uint8_t* a, const uint8_t* b) {
int count;
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
__asm__ volatile(
"mult $zero, $zero \n\t"
GET_SSE( 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS)
GET_SSE( 1 * BPS, 4 + 1 * BPS, 8 + 1 * BPS, 12 + 1 * BPS)
GET_SSE( 2 * BPS, 4 + 2 * BPS, 8 + 2 * BPS, 12 + 2 * BPS)
GET_SSE( 3 * BPS, 4 + 3 * BPS, 8 + 3 * BPS, 12 + 3 * BPS)
GET_SSE( 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS)
GET_SSE( 5 * BPS, 4 + 5 * BPS, 8 + 5 * BPS, 12 + 5 * BPS)
GET_SSE( 6 * BPS, 4 + 6 * BPS, 8 + 6 * BPS, 12 + 6 * BPS)
GET_SSE( 7 * BPS, 4 + 7 * BPS, 8 + 7 * BPS, 12 + 7 * BPS)
"mflo %[count] \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
: [a]"r"(a), [b]"r"(b)
: "memory", "hi", "lo"
);
return count;
}
static int SSE8x8_MIPS32(const uint8_t* a, const uint8_t* b) {
int count;
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
__asm__ volatile(
"mult $zero, $zero \n\t"
GET_SSE(0 * BPS, 4 + 0 * BPS, 1 * BPS, 4 + 1 * BPS)
GET_SSE(2 * BPS, 4 + 2 * BPS, 3 * BPS, 4 + 3 * BPS)
GET_SSE(4 * BPS, 4 + 4 * BPS, 5 * BPS, 4 + 5 * BPS)
GET_SSE(6 * BPS, 4 + 6 * BPS, 7 * BPS, 4 + 7 * BPS)
"mflo %[count] \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
: [a]"r"(a), [b]"r"(b)
: "memory", "hi", "lo"
);
return count;
}
static int SSE4x4_MIPS32(const uint8_t* a, const uint8_t* b) {
int count;
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
__asm__ volatile(
"mult $zero, $zero \n\t"
GET_SSE(0 * BPS, 1 * BPS, 2 * BPS, 3 * BPS)
"mflo %[count] \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
: [a]"r"(a), [b]"r"(b)
: "memory", "hi", "lo"
);
return count;
}
#undef GET_SSE
#undef GET_SSE_INNER
#endif // !WORK_AROUND_GCC
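// For reference, the fused 'madd' chains above compute a plain sum of
// squared differences; a scalar sketch:
#if 0   // reference only, not compiled
static WEBP_INLINE int GetSSE_RefC(const uint8_t* a, const uint8_t* b,
                                   int w, int h) {
  int count = 0;
  int x, y;
  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x) {
      const int diff = (int)a[x] - b[x];
      count += diff * diff;
    }
    a += BPS;
    b += BPS;
  }
  return count;   // e.g. SSE16x16(a, b) == GetSSE_RefC(a, b, 16, 16)
}
#endif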
//------------------------------------------------------------------------------
// Entry point
extern void VP8EncDspInitMIPS32(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPS32(void) {
VP8ITransform = ITransform_MIPS32;
VP8FTransform = FTransform_MIPS32;
VP8EncQuantizeBlock = QuantizeBlock_MIPS32;
VP8EncQuantize2Blocks = Quantize2Blocks_MIPS32;
VP8TDisto4x4 = Disto4x4_MIPS32;
VP8TDisto16x16 = Disto16x16_MIPS32;
#if !defined(WORK_AROUND_GCC)
VP8SSE16x16 = SSE16x16_MIPS32;
VP8SSE8x8 = SSE8x8_MIPS32;
VP8SSE16x8 = SSE16x8_MIPS32;
VP8SSE4x4 = SSE4x4_MIPS32;
#endif
}
#else // !WEBP_USE_MIPS32
WEBP_DSP_INIT_STUB(VP8EncDspInitMIPS32)
#endif // WEBP_USE_MIPS32

File diff suppressed because it is too large.


@@ -0,0 +1,896 @@
// Copyright 2016 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// MSA version of encoder dsp functions.
//
// Author: Prashant Patil (prashant.patil@imgtec.com)
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MSA)
#include <stdlib.h>
#include "src/dsp/msa_macro.h"
#include "src/enc/vp8i_enc.h"
//------------------------------------------------------------------------------
// Transforms
#define IDCT_1D_W(in0, in1, in2, in3, out0, out1, out2, out3) do { \
v4i32 a1_m, b1_m, c1_m, d1_m; \
const v4i32 cospi8sqrt2minus1 = __msa_fill_w(20091); \
const v4i32 sinpi8sqrt2 = __msa_fill_w(35468); \
v4i32 c_tmp1_m = in1 * sinpi8sqrt2; \
v4i32 c_tmp2_m = in3 * cospi8sqrt2minus1; \
v4i32 d_tmp1_m = in1 * cospi8sqrt2minus1; \
v4i32 d_tmp2_m = in3 * sinpi8sqrt2; \
\
ADDSUB2(in0, in2, a1_m, b1_m); \
SRAI_W2_SW(c_tmp1_m, c_tmp2_m, 16); \
c_tmp2_m = c_tmp2_m + in3; \
c1_m = c_tmp1_m - c_tmp2_m; \
SRAI_W2_SW(d_tmp1_m, d_tmp2_m, 16); \
d_tmp1_m = d_tmp1_m + in1; \
d1_m = d_tmp1_m + d_tmp2_m; \
BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \
} while (0)
static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
uint8_t* dst) {
v8i16 input0, input1;
v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
v4i32 res0, res1, res2, res3;
v16i8 dest0, dest1, dest2, dest3;
const v16i8 zero = { 0 };
LD_SH2(in, 8, input0, input1);
UNPCK_SH_SW(input0, in0, in1);
UNPCK_SH_SW(input1, in2, in3);
IDCT_1D_W(in0, in1, in2, in3, hz0, hz1, hz2, hz3);
TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);
IDCT_1D_W(hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3);
SRARI_W4_SW(vt0, vt1, vt2, vt3, 3);
TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3);
LD_SB4(ref, BPS, dest0, dest1, dest2, dest3);
ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3,
res0, res1, res2, res3);
ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3,
res0, res1, res2, res3);
ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3);
CLIP_SW4_0_255(res0, res1, res2, res3);
PCKEV_B2_SW(res0, res1, res2, res3, vt0, vt1);
res0 = (v4i32)__msa_pckev_b((v16i8)vt0, (v16i8)vt1);
ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS);
}
static void ITransform_MSA(const uint8_t* ref, const int16_t* in, uint8_t* dst,
int do_two) {
ITransformOne(ref, in, dst);
if (do_two) {
ITransformOne(ref + 4, in + 16, dst + 4);
}
}
static void FTransform_MSA(const uint8_t* src, const uint8_t* ref,
int16_t* out) {
uint64_t out0, out1, out2, out3;
uint32_t in0, in1, in2, in3;
v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
v8i16 t0, t1, t2, t3;
v16u8 srcl0, srcl1, src0 = { 0 }, src1 = { 0 };
const v8i16 mask0 = { 0, 4, 8, 12, 1, 5, 9, 13 };
const v8i16 mask1 = { 3, 7, 11, 15, 2, 6, 10, 14 };
const v8i16 mask2 = { 4, 0, 5, 1, 6, 2, 7, 3 };
const v8i16 mask3 = { 0, 4, 1, 5, 2, 6, 3, 7 };
const v8i16 cnst0 = { 2217, -5352, 2217, -5352, 2217, -5352, 2217, -5352 };
const v8i16 cnst1 = { 5352, 2217, 5352, 2217, 5352, 2217, 5352, 2217 };
LW4(src, BPS, in0, in1, in2, in3);
INSERT_W4_UB(in0, in1, in2, in3, src0);
LW4(ref, BPS, in0, in1, in2, in3);
INSERT_W4_UB(in0, in1, in2, in3, src1);
ILVRL_B2_UB(src0, src1, srcl0, srcl1);
HSUB_UB2_SH(srcl0, srcl1, t0, t1);
VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3);
ADDSUB2(t2, t3, t0, t1);
t0 = SRLI_H(t0, 3);
VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2);
tmp0 = __msa_hadd_s_w(t3, t3);
tmp2 = __msa_hsub_s_w(t3, t3);
FILL_W2_SW(1812, 937, tmp1, tmp3);
DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1);
SRAI_W2_SW(tmp1, tmp3, 9);
PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1);
VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3);
ADDSUB2(t2, t3, t0, t1);
VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2);
tmp0 = __msa_hadd_s_w(t3, t3);
tmp2 = __msa_hsub_s_w(t3, t3);
ADDVI_W2_SW(tmp0, 7, tmp2, 7, tmp0, tmp2);
SRAI_W2_SW(tmp0, tmp2, 4);
FILL_W2_SW(12000, 51000, tmp1, tmp3);
DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1);
SRAI_W2_SW(tmp1, tmp3, 16);
UNPCK_R_SH_SW(t1, tmp4);
tmp5 = __msa_ceqi_w(tmp4, 0);
tmp4 = (v4i32)__msa_nor_v((v16u8)tmp5, (v16u8)tmp5);
tmp5 = __msa_fill_w(1);
tmp5 = (v4i32)__msa_and_v((v16u8)tmp5, (v16u8)tmp4);
tmp1 += tmp5;
PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1);
out0 = __msa_copy_s_d((v2i64)t0, 0);
out1 = __msa_copy_s_d((v2i64)t0, 1);
out2 = __msa_copy_s_d((v2i64)t1, 0);
out3 = __msa_copy_s_d((v2i64)t1, 1);
SD4(out0, out1, out2, out3, out, 8);
}
static void FTransformWHT_MSA(const int16_t* in, int16_t* out) {
v8i16 in0 = { 0 };
v8i16 in1 = { 0 };
v8i16 tmp0, tmp1, tmp2, tmp3;
v8i16 out0, out1;
const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 };
const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 };
const v8i16 mask2 = { 0, 4, 8, 12, 1, 5, 9, 13 };
const v8i16 mask3 = { 3, 7, 11, 15, 2, 6, 10, 14 };
in0 = __msa_insert_h(in0, 0, in[ 0]);
in0 = __msa_insert_h(in0, 1, in[ 64]);
in0 = __msa_insert_h(in0, 2, in[128]);
in0 = __msa_insert_h(in0, 3, in[192]);
in0 = __msa_insert_h(in0, 4, in[ 16]);
in0 = __msa_insert_h(in0, 5, in[ 80]);
in0 = __msa_insert_h(in0, 6, in[144]);
in0 = __msa_insert_h(in0, 7, in[208]);
in1 = __msa_insert_h(in1, 0, in[ 48]);
in1 = __msa_insert_h(in1, 1, in[112]);
in1 = __msa_insert_h(in1, 2, in[176]);
in1 = __msa_insert_h(in1, 3, in[240]);
in1 = __msa_insert_h(in1, 4, in[ 32]);
in1 = __msa_insert_h(in1, 5, in[ 96]);
in1 = __msa_insert_h(in1, 6, in[160]);
in1 = __msa_insert_h(in1, 7, in[224]);
ADDSUB2(in0, in1, tmp0, tmp1);
VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
ADDSUB2(tmp2, tmp3, tmp0, tmp1);
VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1);
ADDSUB2(in0, in1, tmp0, tmp1);
VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
ADDSUB2(tmp2, tmp3, out0, out1);
SRAI_H2_SH(out0, out1, 1);
ST_SH2(out0, out1, out, 8);
}
static int TTransform_MSA(const uint8_t* in, const uint16_t* w) {
int sum;
uint32_t in0_m, in1_m, in2_m, in3_m;
v16i8 src0 = { 0 };
v8i16 in0, in1, tmp0, tmp1, tmp2, tmp3;
v4i32 dst0, dst1;
const v16i8 zero = { 0 };
const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 };
const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 };
const v8i16 mask2 = { 0, 4, 8, 12, 1, 5, 9, 13 };
const v8i16 mask3 = { 3, 7, 11, 15, 2, 6, 10, 14 };
LW4(in, BPS, in0_m, in1_m, in2_m, in3_m);
INSERT_W4_SB(in0_m, in1_m, in2_m, in3_m, src0);
ILVRL_B2_SH(zero, src0, tmp0, tmp1);
VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1);
ADDSUB2(in0, in1, tmp0, tmp1);
VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
ADDSUB2(tmp2, tmp3, tmp0, tmp1);
VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1);
ADDSUB2(in0, in1, tmp0, tmp1);
VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
ADDSUB2(tmp2, tmp3, tmp0, tmp1);
tmp0 = __msa_add_a_h(tmp0, (v8i16)zero);
tmp1 = __msa_add_a_h(tmp1, (v8i16)zero);
LD_SH2(w, 8, tmp2, tmp3);
DOTP_SH2_SW(tmp0, tmp1, tmp2, tmp3, dst0, dst1);
dst0 = dst0 + dst1;
sum = HADD_SW_S32(dst0);
return sum;
}
static int Disto4x4_MSA(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
const int sum1 = TTransform_MSA(a, w);
const int sum2 = TTransform_MSA(b, w);
return abs(sum2 - sum1) >> 5;
}
static int Disto16x16_MSA(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
int D = 0;
int x, y;
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
for (x = 0; x < 16; x += 4) {
D += Disto4x4_MSA(a + x + y, b + x + y, w);
}
}
return D;
}
//------------------------------------------------------------------------------
// Histogram
static void CollectHistogram_MSA(const uint8_t* ref, const uint8_t* pred,
int start_block, int end_block,
VP8Histogram* const histo) {
int j;
int distribution[MAX_COEFF_THRESH + 1] = { 0 };
for (j = start_block; j < end_block; ++j) {
int16_t out[16];
VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
{
int k;
v8i16 coeff0, coeff1;
const v8i16 zero = { 0 };
const v8i16 max_coeff_thr = __msa_ldi_h(MAX_COEFF_THRESH);
LD_SH2(&out[0], 8, coeff0, coeff1);
coeff0 = __msa_add_a_h(coeff0, zero);
coeff1 = __msa_add_a_h(coeff1, zero);
SRAI_H2_SH(coeff0, coeff1, 3);
coeff0 = __msa_min_s_h(coeff0, max_coeff_thr);
coeff1 = __msa_min_s_h(coeff1, max_coeff_thr);
ST_SH2(coeff0, coeff1, &out[0], 8);
for (k = 0; k < 16; ++k) {
++distribution[out[k]];
}
}
}
VP8SetHistogramData(distribution, histo);
}
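// For reference, the vector clamp above is the SIMD form of the scalar
// per-coefficient histogram update:
//   for (k = 0; k < 16; ++k) {
//     const int v = abs(out[k]) >> 3;
//     ++distribution[(v > MAX_COEFF_THRESH) ? MAX_COEFF_THRESH : v];
//   }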
//------------------------------------------------------------------------------
// Intra predictions
// luma 4x4 prediction
#define DST(x, y) dst[(x) + (y) * BPS]
#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
#define AVG2(a, b) (((a) + (b) + 1) >> 1)
static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) { // vertical
const v16u8 A1 = { 0 };
const uint64_t val_m = LD(top - 1);
const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);
const v16u8 B = SLDI_UB(A, A, 1);
const v16u8 C = SLDI_UB(A, A, 2);
const v16u8 AC = __msa_ave_u_b(A, C);
const v16u8 B2 = __msa_ave_u_b(B, B);
const v16u8 R = __msa_aver_u_b(AC, B2);
const uint32_t out = __msa_copy_s_w((v4i32)R, 0);
SW4(out, out, out, out, dst, BPS);
}
static WEBP_INLINE void HE4(uint8_t* dst, const uint8_t* top) { // horizontal
const int X = top[-1];
const int I = top[-2];
const int J = top[-3];
const int K = top[-4];
const int L = top[-5];
WebPUint32ToMem(dst + 0 * BPS, 0x01010101U * AVG3(X, I, J));
WebPUint32ToMem(dst + 1 * BPS, 0x01010101U * AVG3(I, J, K));
WebPUint32ToMem(dst + 2 * BPS, 0x01010101U * AVG3(J, K, L));
WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
}
static WEBP_INLINE void DC4(uint8_t* dst, const uint8_t* top) {
uint32_t dc = 4;
int i;
for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
dc >>= 3;
dc = dc | (dc << 8) | (dc << 16) | (dc << 24);
SW4(dc, dc, dc, dc, dst, BPS);
}
static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) {
const v16u8 A2 = { 0 };
const uint64_t val_m = LD(top - 5);
const v16u8 A1 = (v16u8)__msa_insert_d((v2i64)A2, 0, val_m);
const v16u8 A = (v16u8)__msa_insert_b((v16i8)A1, 8, top[3]);
const v16u8 B = SLDI_UB(A, A, 1);
const v16u8 C = SLDI_UB(A, A, 2);
const v16u8 AC = __msa_ave_u_b(A, C);
const v16u8 B2 = __msa_ave_u_b(B, B);
const v16u8 R0 = __msa_aver_u_b(AC, B2);
const v16u8 R1 = SLDI_UB(R0, R0, 1);
const v16u8 R2 = SLDI_UB(R1, R1, 1);
const v16u8 R3 = SLDI_UB(R2, R2, 1);
const uint32_t val0 = __msa_copy_s_w((v4i32)R0, 0);
const uint32_t val1 = __msa_copy_s_w((v4i32)R1, 0);
const uint32_t val2 = __msa_copy_s_w((v4i32)R2, 0);
const uint32_t val3 = __msa_copy_s_w((v4i32)R3, 0);
SW4(val3, val2, val1, val0, dst, BPS);
}
static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) {
const v16u8 A1 = { 0 };
const uint64_t val_m = LD(top);
const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);
const v16u8 B = SLDI_UB(A, A, 1);
const v16u8 C1 = SLDI_UB(A, A, 2);
const v16u8 C = (v16u8)__msa_insert_b((v16i8)C1, 6, top[7]);
const v16u8 AC = __msa_ave_u_b(A, C);
const v16u8 B2 = __msa_ave_u_b(B, B);
const v16u8 R0 = __msa_aver_u_b(AC, B2);
const v16u8 R1 = SLDI_UB(R0, R0, 1);
const v16u8 R2 = SLDI_UB(R1, R1, 1);
const v16u8 R3 = SLDI_UB(R2, R2, 1);
const uint32_t val0 = __msa_copy_s_w((v4i32)R0, 0);
const uint32_t val1 = __msa_copy_s_w((v4i32)R1, 0);
const uint32_t val2 = __msa_copy_s_w((v4i32)R2, 0);
const uint32_t val3 = __msa_copy_s_w((v4i32)R3, 0);
SW4(val0, val1, val2, val3, dst, BPS);
}
static WEBP_INLINE void VR4(uint8_t* dst, const uint8_t* top) {
const int X = top[-1];
const int I = top[-2];
const int J = top[-3];
const int K = top[-4];
const int A = top[0];
const int B = top[1];
const int C = top[2];
const int D = top[3];
DST(0, 0) = DST(1, 2) = AVG2(X, A);
DST(1, 0) = DST(2, 2) = AVG2(A, B);
DST(2, 0) = DST(3, 2) = AVG2(B, C);
DST(3, 0) = AVG2(C, D);
DST(0, 3) = AVG3(K, J, I);
DST(0, 2) = AVG3(J, I, X);
DST(0, 1) = DST(1, 3) = AVG3(I, X, A);
DST(1, 1) = DST(2, 3) = AVG3(X, A, B);
DST(2, 1) = DST(3, 3) = AVG3(A, B, C);
DST(3, 1) = AVG3(B, C, D);
}
static WEBP_INLINE void VL4(uint8_t* dst, const uint8_t* top) {
const int A = top[0];
const int B = top[1];
const int C = top[2];
const int D = top[3];
const int E = top[4];
const int F = top[5];
const int G = top[6];
const int H = top[7];
DST(0, 0) = AVG2(A, B);
DST(1, 0) = DST(0, 2) = AVG2(B, C);
DST(2, 0) = DST(1, 2) = AVG2(C, D);
DST(3, 0) = DST(2, 2) = AVG2(D, E);
DST(0, 1) = AVG3(A, B, C);
DST(1, 1) = DST(0, 3) = AVG3(B, C, D);
DST(2, 1) = DST(1, 3) = AVG3(C, D, E);
DST(3, 1) = DST(2, 3) = AVG3(D, E, F);
DST(3, 2) = AVG3(E, F, G);
DST(3, 3) = AVG3(F, G, H);
}
static WEBP_INLINE void HU4(uint8_t* dst, const uint8_t* top) {
const int I = top[-2];
const int J = top[-3];
const int K = top[-4];
const int L = top[-5];
DST(0, 0) = AVG2(I, J);
DST(2, 0) = DST(0, 1) = AVG2(J, K);
DST(2, 1) = DST(0, 2) = AVG2(K, L);
DST(1, 0) = AVG3(I, J, K);
DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
DST(3, 1) = DST(1, 2) = AVG3(K, L, L);
DST(3, 2) = DST(2, 2) =
DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
}
static WEBP_INLINE void HD4(uint8_t* dst, const uint8_t* top) {
const int X = top[-1];
const int I = top[-2];
const int J = top[-3];
const int K = top[-4];
const int L = top[-5];
const int A = top[0];
const int B = top[1];
const int C = top[2];
DST(0, 0) = DST(2, 1) = AVG2(I, X);
DST(0, 1) = DST(2, 2) = AVG2(J, I);
DST(0, 2) = DST(2, 3) = AVG2(K, J);
DST(0, 3) = AVG2(L, K);
DST(3, 0) = AVG3(A, B, C);
DST(2, 0) = AVG3(X, A, B);
DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
DST(1, 3) = AVG3(L, K, J);
}
static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) {
const v16i8 zero = { 0 };
const v8i16 TL = (v8i16)__msa_fill_h(top[-1]);
const v8i16 L0 = (v8i16)__msa_fill_h(top[-2]);
const v8i16 L1 = (v8i16)__msa_fill_h(top[-3]);
const v8i16 L2 = (v8i16)__msa_fill_h(top[-4]);
const v8i16 L3 = (v8i16)__msa_fill_h(top[-5]);
const v16u8 T1 = LD_UB(top);
const v8i16 T = (v8i16)__msa_ilvr_b(zero, (v16i8)T1);
const v8i16 d = T - TL;
v8i16 r0, r1, r2, r3;
ADD4(d, L0, d, L1, d, L2, d, L3, r0, r1, r2, r3);
CLIP_SH4_0_255(r0, r1, r2, r3);
PCKEV_ST4x4_UB(r0, r1, r2, r3, dst, BPS);
}
#undef DST
#undef AVG3
#undef AVG2
static void Intra4Preds_MSA(uint8_t* dst, const uint8_t* top) {
DC4(I4DC4 + dst, top);
TM4(I4TM4 + dst, top);
VE4(I4VE4 + dst, top);
HE4(I4HE4 + dst, top);
RD4(I4RD4 + dst, top);
VR4(I4VR4 + dst, top);
LD4(I4LD4 + dst, top);
VL4(I4VL4 + dst, top);
HD4(I4HD4 + dst, top);
HU4(I4HU4 + dst, top);
}
// luma 16x16 prediction
#define STORE16x16(out, dst) do { \
ST_UB8(out, out, out, out, out, out, out, out, dst + 0 * BPS, BPS); \
ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS); \
} while (0)
static WEBP_INLINE void VerticalPred16x16(uint8_t* dst, const uint8_t* top) {
if (top != NULL) {
const v16u8 out = LD_UB(top);
STORE16x16(out, dst);
} else {
const v16u8 out = (v16u8)__msa_fill_b(0x7f);
STORE16x16(out, dst);
}
}
static WEBP_INLINE void HorizontalPred16x16(uint8_t* dst,
const uint8_t* left) {
if (left != NULL) {
int j;
for (j = 0; j < 16; j += 4) {
const v16u8 L0 = (v16u8)__msa_fill_b(left[0]);
const v16u8 L1 = (v16u8)__msa_fill_b(left[1]);
const v16u8 L2 = (v16u8)__msa_fill_b(left[2]);
const v16u8 L3 = (v16u8)__msa_fill_b(left[3]);
ST_UB4(L0, L1, L2, L3, dst, BPS);
dst += 4 * BPS;
left += 4;
}
} else {
const v16u8 out = (v16u8)__msa_fill_b(0x81);
STORE16x16(out, dst);
}
}
static WEBP_INLINE void TrueMotion16x16(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
if (left != NULL) {
if (top != NULL) {
int j;
v8i16 d1, d2;
const v16i8 zero = { 0 };
const v8i16 TL = (v8i16)__msa_fill_h(left[-1]);
const v16u8 T = LD_UB(top);
ILVRL_B2_SH(zero, T, d1, d2);
SUB2(d1, TL, d2, TL, d1, d2);
for (j = 0; j < 16; j += 4) {
v16i8 t0, t1, t2, t3;
v8i16 r0, r1, r2, r3, r4, r5, r6, r7;
const v8i16 L0 = (v8i16)__msa_fill_h(left[j + 0]);
const v8i16 L1 = (v8i16)__msa_fill_h(left[j + 1]);
const v8i16 L2 = (v8i16)__msa_fill_h(left[j + 2]);
const v8i16 L3 = (v8i16)__msa_fill_h(left[j + 3]);
ADD4(d1, L0, d1, L1, d1, L2, d1, L3, r0, r1, r2, r3);
ADD4(d2, L0, d2, L1, d2, L2, d2, L3, r4, r5, r6, r7);
CLIP_SH4_0_255(r0, r1, r2, r3);
CLIP_SH4_0_255(r4, r5, r6, r7);
PCKEV_B4_SB(r4, r0, r5, r1, r6, r2, r7, r3, t0, t1, t2, t3);
ST_SB4(t0, t1, t2, t3, dst, BPS);
dst += 4 * BPS;
}
} else {
HorizontalPred16x16(dst, left);
}
} else {
if (top != NULL) {
VerticalPred16x16(dst, top);
} else {
const v16u8 out = (v16u8)__msa_fill_b(0x81);
STORE16x16(out, dst);
}
}
}
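// For reference, TrueMotion predicts each pixel as
//   dst[x] = clip_8b(top[x] + left[y] - top_left)
// with top_left == left[-1] here; the vector code above precomputes
// d = top - top_left once and adds the broadcast left[y] row by row.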
static WEBP_INLINE void DCMode16x16(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
int DC;
v16u8 out;
if (top != NULL && left != NULL) {
const v16u8 rtop = LD_UB(top);
const v8u16 dctop = __msa_hadd_u_h(rtop, rtop);
const v16u8 rleft = LD_UB(left);
const v8u16 dcleft = __msa_hadd_u_h(rleft, rleft);
const v8u16 dctemp = dctop + dcleft;
DC = HADD_UH_U32(dctemp);
DC = (DC + 16) >> 5;
} else if (left != NULL) { // left but no top
const v16u8 rleft = LD_UB(left);
const v8u16 dcleft = __msa_hadd_u_h(rleft, rleft);
DC = HADD_UH_U32(dcleft);
DC = (DC + DC + 16) >> 5;
} else if (top != NULL) { // top but no left
const v16u8 rtop = LD_UB(top);
const v8u16 dctop = __msa_hadd_u_h(rtop, rtop);
DC = HADD_UH_U32(dctop);
DC = (DC + DC + 16) >> 5;
} else { // no top, no left, nothing.
DC = 0x80;
}
out = (v16u8)__msa_fill_b(DC);
STORE16x16(out, dst);
}
static void Intra16Preds_MSA(uint8_t* dst,
const uint8_t* left, const uint8_t* top) {
DCMode16x16(I16DC16 + dst, left, top);
VerticalPred16x16(I16VE16 + dst, top);
HorizontalPred16x16(I16HE16 + dst, left);
TrueMotion16x16(I16TM16 + dst, left, top);
}
// Chroma 8x8 prediction
#define CALC_DC8(in, out) do { \
const v8u16 temp0 = __msa_hadd_u_h(in, in); \
const v4u32 temp1 = __msa_hadd_u_w(temp0, temp0); \
const v2i64 temp2 = (v2i64)__msa_hadd_u_d(temp1, temp1); \
const v2i64 temp3 = __msa_splati_d(temp2, 1); \
const v2i64 temp4 = temp3 + temp2; \
const v16i8 temp5 = (v16i8)__msa_srari_d(temp4, 4); \
const v2i64 temp6 = (v2i64)__msa_splati_b(temp5, 0); \
out = __msa_copy_s_d(temp6, 0); \
} while (0)
#define STORE8x8(out, dst) do { \
SD4(out, out, out, out, dst + 0 * BPS, BPS); \
SD4(out, out, out, out, dst + 4 * BPS, BPS); \
} while (0)
static WEBP_INLINE void VerticalPred8x8(uint8_t* dst, const uint8_t* top) {
if (top != NULL) {
const uint64_t out = LD(top);
STORE8x8(out, dst);
} else {
const uint64_t out = 0x7f7f7f7f7f7f7f7fULL;
STORE8x8(out, dst);
}
}
static WEBP_INLINE void HorizontalPred8x8(uint8_t* dst, const uint8_t* left) {
if (left != NULL) {
int j;
for (j = 0; j < 8; j += 4) {
const v16u8 L0 = (v16u8)__msa_fill_b(left[0]);
const v16u8 L1 = (v16u8)__msa_fill_b(left[1]);
const v16u8 L2 = (v16u8)__msa_fill_b(left[2]);
const v16u8 L3 = (v16u8)__msa_fill_b(left[3]);
const uint64_t out0 = __msa_copy_s_d((v2i64)L0, 0);
const uint64_t out1 = __msa_copy_s_d((v2i64)L1, 0);
const uint64_t out2 = __msa_copy_s_d((v2i64)L2, 0);
const uint64_t out3 = __msa_copy_s_d((v2i64)L3, 0);
SD4(out0, out1, out2, out3, dst, BPS);
dst += 4 * BPS;
left += 4;
}
} else {
const uint64_t out = 0x8181818181818181ULL;
STORE8x8(out, dst);
}
}
static WEBP_INLINE void TrueMotion8x8(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
if (left != NULL) {
if (top != NULL) {
int j;
const v8i16 TL = (v8i16)__msa_fill_h(left[-1]);
const v16u8 T1 = LD_UB(top);
const v16i8 zero = { 0 };
const v8i16 T = (v8i16)__msa_ilvr_b(zero, (v16i8)T1);
const v8i16 d = T - TL;
for (j = 0; j < 8; j += 4) {
uint64_t out0, out1, out2, out3;
v16i8 t0, t1;
v8i16 r0 = (v8i16)__msa_fill_h(left[j + 0]);
v8i16 r1 = (v8i16)__msa_fill_h(left[j + 1]);
v8i16 r2 = (v8i16)__msa_fill_h(left[j + 2]);
v8i16 r3 = (v8i16)__msa_fill_h(left[j + 3]);
ADD4(d, r0, d, r1, d, r2, d, r3, r0, r1, r2, r3);
CLIP_SH4_0_255(r0, r1, r2, r3);
PCKEV_B2_SB(r1, r0, r3, r2, t0, t1);
out0 = __msa_copy_s_d((v2i64)t0, 0);
out1 = __msa_copy_s_d((v2i64)t0, 1);
out2 = __msa_copy_s_d((v2i64)t1, 0);
out3 = __msa_copy_s_d((v2i64)t1, 1);
SD4(out0, out1, out2, out3, dst, BPS);
dst += 4 * BPS;
}
} else {
HorizontalPred8x8(dst, left);
}
} else {
if (top != NULL) {
VerticalPred8x8(dst, top);
} else {
const uint64_t out = 0x8181818181818181ULL;
STORE8x8(out, dst);
}
}
}
static WEBP_INLINE void DCMode8x8(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
uint64_t out;
v16u8 src = { 0 };
if (top != NULL && left != NULL) {
const uint64_t left_m = LD(left);
const uint64_t top_m = LD(top);
INSERT_D2_UB(left_m, top_m, src);
CALC_DC8(src, out);
} else if (left != NULL) { // left but no top
const uint64_t left_m = LD(left);
INSERT_D2_UB(left_m, left_m, src);
CALC_DC8(src, out);
} else if (top != NULL) { // top but no left
const uint64_t top_m = LD(top);
INSERT_D2_UB(top_m, top_m, src);
CALC_DC8(src, out);
} else { // no top, no left, nothing.
src = (v16u8)__msa_fill_b(0x80);
out = __msa_copy_s_d((v2i64)src, 0);
}
STORE8x8(out, dst);
}
static void IntraChromaPreds_MSA(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
// U block
DCMode8x8(C8DC8 + dst, left, top);
VerticalPred8x8(C8VE8 + dst, top);
HorizontalPred8x8(C8HE8 + dst, left);
TrueMotion8x8(C8TM8 + dst, left, top);
// V block
dst += 8;
if (top != NULL) top += 8;
if (left != NULL) left += 16;
DCMode8x8(C8DC8 + dst, left, top);
VerticalPred8x8(C8VE8 + dst, top);
HorizontalPred8x8(C8HE8 + dst, left);
TrueMotion8x8(C8TM8 + dst, left, top);
}
//------------------------------------------------------------------------------
// Metric
#define PACK_DOTP_UB4_SW(in0, in1, in2, in3, out0, out1, out2, out3) do { \
v16u8 tmp0, tmp1; \
v8i16 tmp2, tmp3; \
ILVRL_B2_UB(in0, in1, tmp0, tmp1); \
HSUB_UB2_SH(tmp0, tmp1, tmp2, tmp3); \
DOTP_SH2_SW(tmp2, tmp3, tmp2, tmp3, out0, out1); \
ILVRL_B2_UB(in2, in3, tmp0, tmp1); \
HSUB_UB2_SH(tmp0, tmp1, tmp2, tmp3); \
DOTP_SH2_SW(tmp2, tmp3, tmp2, tmp3, out2, out3); \
} while (0)
#define PACK_DPADD_UB4_SW(in0, in1, in2, in3, out0, out1, out2, out3) do { \
v16u8 tmp0, tmp1; \
v8i16 tmp2, tmp3; \
ILVRL_B2_UB(in0, in1, tmp0, tmp1); \
HSUB_UB2_SH(tmp0, tmp1, tmp2, tmp3); \
DPADD_SH2_SW(tmp2, tmp3, tmp2, tmp3, out0, out1); \
ILVRL_B2_UB(in2, in3, tmp0, tmp1); \
HSUB_UB2_SH(tmp0, tmp1, tmp2, tmp3); \
DPADD_SH2_SW(tmp2, tmp3, tmp2, tmp3, out2, out3); \
} while (0)
static int SSE16x16_MSA(const uint8_t* a, const uint8_t* b) {
uint32_t sum;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
v4i32 out0, out1, out2, out3;
LD_UB8(a, BPS, src0, src1, src2, src3, src4, src5, src6, src7);
LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
PACK_DOTP_UB4_SW(src0, ref0, src1, ref1, out0, out1, out2, out3);
PACK_DPADD_UB4_SW(src2, ref2, src3, ref3, out0, out1, out2, out3);
PACK_DPADD_UB4_SW(src4, ref4, src5, ref5, out0, out1, out2, out3);
PACK_DPADD_UB4_SW(src6, ref6, src7, ref7, out0, out1, out2, out3);
a += 8 * BPS;
b += 8 * BPS;
LD_UB8(a, BPS, src0, src1, src2, src3, src4, src5, src6, src7);
LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
PACK_DPADD_UB4_SW(src0, ref0, src1, ref1, out0, out1, out2, out3);
PACK_DPADD_UB4_SW(src2, ref2, src3, ref3, out0, out1, out2, out3);
PACK_DPADD_UB4_SW(src4, ref4, src5, ref5, out0, out1, out2, out3);
PACK_DPADD_UB4_SW(src6, ref6, src7, ref7, out0, out1, out2, out3);
out0 += out1;
out2 += out3;
out0 += out2;
sum = HADD_SW_S32(out0);
return sum;
}
static int SSE16x8_MSA(const uint8_t* a, const uint8_t* b) {
uint32_t sum;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
v4i32 out0, out1, out2, out3;
LD_UB8(a, BPS, src0, src1, src2, src3, src4, src5, src6, src7);
LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
PACK_DOTP_UB4_SW(src0, ref0, src1, ref1, out0, out1, out2, out3);
PACK_DPADD_UB4_SW(src2, ref2, src3, ref3, out0, out1, out2, out3);
PACK_DPADD_UB4_SW(src4, ref4, src5, ref5, out0, out1, out2, out3);
PACK_DPADD_UB4_SW(src6, ref6, src7, ref7, out0, out1, out2, out3);
out0 += out1;
out2 += out3;
out0 += out2;
sum = HADD_SW_S32(out0);
return sum;
}
static int SSE8x8_MSA(const uint8_t* a, const uint8_t* b) {
uint32_t sum;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
v16u8 t0, t1, t2, t3;
v4i32 out0, out1, out2, out3;
LD_UB8(a, BPS, src0, src1, src2, src3, src4, src5, src6, src7);
LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
ILVR_B4_UB(src0, src1, src2, src3, ref0, ref1, ref2, ref3, t0, t1, t2, t3);
PACK_DOTP_UB4_SW(t0, t2, t1, t3, out0, out1, out2, out3);
ILVR_B4_UB(src4, src5, src6, src7, ref4, ref5, ref6, ref7, t0, t1, t2, t3);
PACK_DPADD_UB4_SW(t0, t2, t1, t3, out0, out1, out2, out3);
out0 += out1;
out2 += out3;
out0 += out2;
sum = HADD_SW_S32(out0);
return sum;
}
static int SSE4x4_MSA(const uint8_t* a, const uint8_t* b) {
uint32_t sum = 0;
uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
v16u8 src = { 0 }, ref = { 0 }, tmp0, tmp1;
v8i16 diff0, diff1;
v4i32 out0, out1;
LW4(a, BPS, src0, src1, src2, src3);
LW4(b, BPS, ref0, ref1, ref2, ref3);
INSERT_W4_UB(src0, src1, src2, src3, src);
INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
ILVRL_B2_UB(src, ref, tmp0, tmp1);
HSUB_UB2_SH(tmp0, tmp1, diff0, diff1);
DOTP_SH2_SW(diff0, diff1, diff0, diff1, out0, out1);
out0 += out1;
sum = HADD_SW_S32(out0);
return sum;
}
//------------------------------------------------------------------------------
// Quantization
static int QuantizeBlock_MSA(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
int sum;
v8i16 in0, in1, sh0, sh1, out0, out1;
v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, sign0, sign1;
v4i32 s0, s1, s2, s3, b0, b1, b2, b3, t0, t1, t2, t3;
const v8i16 zero = { 0 };
const v8i16 zigzag0 = { 0, 1, 4, 8, 5, 2, 3, 6 };
const v8i16 zigzag1 = { 9, 12, 13, 10, 7, 11, 14, 15 };
const v8i16 maxlevel = __msa_fill_h(MAX_LEVEL);
LD_SH2(&in[0], 8, in0, in1);
LD_SH2(&mtx->sharpen_[0], 8, sh0, sh1);
tmp4 = __msa_add_a_h(in0, zero);
tmp5 = __msa_add_a_h(in1, zero);
ILVRL_H2_SH(sh0, tmp4, tmp0, tmp1);
ILVRL_H2_SH(sh1, tmp5, tmp2, tmp3);
HADD_SH4_SW(tmp0, tmp1, tmp2, tmp3, s0, s1, s2, s3);
sign0 = (in0 < zero);
sign1 = (in1 < zero); // sign
LD_SH2(&mtx->iq_[0], 8, tmp0, tmp1); // iq
ILVRL_H2_SW(zero, tmp0, t0, t1);
ILVRL_H2_SW(zero, tmp1, t2, t3);
LD_SW4(&mtx->bias_[0], 4, b0, b1, b2, b3); // bias
MUL4(t0, s0, t1, s1, t2, s2, t3, s3, t0, t1, t2, t3);
ADD4(b0, t0, b1, t1, b2, t2, b3, t3, b0, b1, b2, b3);
SRAI_W4_SW(b0, b1, b2, b3, 17);
PCKEV_H2_SH(b1, b0, b3, b2, tmp2, tmp3);
tmp0 = (tmp2 > maxlevel);
tmp1 = (tmp3 > maxlevel);
tmp2 = (v8i16)__msa_bmnz_v((v16u8)tmp2, (v16u8)maxlevel, (v16u8)tmp0);
tmp3 = (v8i16)__msa_bmnz_v((v16u8)tmp3, (v16u8)maxlevel, (v16u8)tmp1);
SUB2(zero, tmp2, zero, tmp3, tmp0, tmp1);
tmp2 = (v8i16)__msa_bmnz_v((v16u8)tmp2, (v16u8)tmp0, (v16u8)sign0);
tmp3 = (v8i16)__msa_bmnz_v((v16u8)tmp3, (v16u8)tmp1, (v16u8)sign1);
LD_SW4(&mtx->zthresh_[0], 4, t0, t1, t2, t3); // zthresh
t0 = (s0 > t0);
t1 = (s1 > t1);
t2 = (s2 > t2);
t3 = (s3 > t3);
PCKEV_H2_SH(t1, t0, t3, t2, tmp0, tmp1);
tmp4 = (v8i16)__msa_bmnz_v((v16u8)zero, (v16u8)tmp2, (v16u8)tmp0);
tmp5 = (v8i16)__msa_bmnz_v((v16u8)zero, (v16u8)tmp3, (v16u8)tmp1);
LD_SH2(&mtx->q_[0], 8, tmp0, tmp1);
MUL2(tmp4, tmp0, tmp5, tmp1, in0, in1);
VSHF_H2_SH(tmp4, tmp5, tmp4, tmp5, zigzag0, zigzag1, out0, out1);
ST_SH2(in0, in1, &in[0], 8);
ST_SH2(out0, out1, &out[0], 8);
out0 = __msa_add_a_h(out0, out1);
sum = HADD_SH_S32(out0);
return (sum > 0);
}
static int Quantize2Blocks_MSA(int16_t in[32], int16_t out[32],
const VP8Matrix* const mtx) {
int nz;
nz = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
return nz;
}
//------------------------------------------------------------------------------
// Entry point
extern void VP8EncDspInitMSA(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMSA(void) {
VP8ITransform = ITransform_MSA;
VP8FTransform = FTransform_MSA;
VP8FTransformWHT = FTransformWHT_MSA;
VP8TDisto4x4 = Disto4x4_MSA;
VP8TDisto16x16 = Disto16x16_MSA;
VP8CollectHistogram = CollectHistogram_MSA;
VP8EncPredLuma4 = Intra4Preds_MSA;
VP8EncPredLuma16 = Intra16Preds_MSA;
VP8EncPredChroma8 = IntraChromaPreds_MSA;
VP8SSE16x16 = SSE16x16_MSA;
VP8SSE16x8 = SSE16x8_MSA;
VP8SSE8x8 = SSE8x8_MSA;
VP8SSE4x4 = SSE4x4_MSA;
VP8EncQuantizeBlock = QuantizeBlock_MSA;
VP8EncQuantize2Blocks = Quantize2Blocks_MSA;
VP8EncQuantizeBlockWHT = QuantizeBlock_MSA;
}
#else // !WEBP_USE_MSA
WEBP_DSP_INIT_STUB(VP8EncDspInitMSA)
#endif // WEBP_USE_MSA


@@ -0,0 +1,938 @@
// Copyright 2012 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// ARM NEON version of speed-critical encoding functions.
//
// adapted from libvpx (http://www.webmproject.org/code/)
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_NEON)
#include <assert.h>
#include "src/dsp/neon.h"
#include "src/enc/vp8i_enc.h"
//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)
// Inverse transform.
// This code is pretty much the same as TransformOne in the dec_neon.c, except
// for subtraction to *ref. See the comments there for algorithmic explanations.
static const int16_t kC1 = 20091;
static const int16_t kC2 = 17734; // half of kC2, actually. See comment above.
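// Note on the halved constant: vqdmulh(a, b) computes (2 * a * b) >> 16,
// so multiplying by kC2 / 2 == 17734 yields the desired (a * 35468) >> 16
// directly. For kC1, the code instead shifts the doubled product right by
// one and adds 'a' back (vsraq_n_s16), i.e. a + ((a * 20091) >> 16).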
// This code works but is *slower* than the inlined-asm version below
// (with gcc-4.6). So we disable it for now. Later, it'll be made conditional
// on the WEBP_USE_INTRINSICS define.
// With gcc-4.8, it's a little faster than the inlined assembly.
#if defined(WEBP_USE_INTRINSICS)
// Treats 'v' as an uint8x8_t and zero extends to an int16x8_t.
static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint32x2_t v) {
return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v)));
}
// Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
// to the corresponding rows of 'dst'.
static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
const int16x8_t dst01,
const int16x8_t dst23) {
// Unsigned saturate to 8b.
const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
// Store the results.
vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
}
static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
const int16x8_t row23,
const uint8_t* const ref,
uint8_t* const dst) {
uint32x2_t dst01 = vdup_n_u32(0);
uint32x2_t dst23 = vdup_n_u32(0);
// Load the source pixels.
dst01 = vld1_lane_u32((uint32_t*)(ref + 0 * BPS), dst01, 0);
dst23 = vld1_lane_u32((uint32_t*)(ref + 2 * BPS), dst23, 0);
dst01 = vld1_lane_u32((uint32_t*)(ref + 1 * BPS), dst01, 1);
dst23 = vld1_lane_u32((uint32_t*)(ref + 3 * BPS), dst23, 1);
{
// Convert to 16b.
const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(dst01);
const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(dst23);
// Descale with rounding.
const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
// Add the inverse transform.
SaturateAndStore4x4_NEON(dst, out01, out23);
}
}
static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
const int16x8_t in1,
int16x8x2_t* const out) {
// a0 a1 a2 a3 | b0 b1 b2 b3 => a0 b0 c0 d0 | a1 b1 c1 d1
// c0 c1 c2 c3 | d0 d1 d2 d3 a2 b2 c2 d2 | a3 b3 c3 d3
const int16x8x2_t tmp0 = vzipq_s16(in0, in1); // a0 c0 a1 c1 a2 c2 ...
// b0 d0 b1 d1 b2 d2 ...
*out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
}
static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
// {rows} = in0 | in4
// in8 | in12
// B1 = in4 | in12
const int16x8_t B1 =
vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
// C0 = kC1 * in4 | kC1 * in12
// C1 = kC2 * in4 | kC2 * in12
const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
vget_low_s16(rows->val[1])); // in0 + in8
const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
vget_low_s16(rows->val[1])); // in0 - in8
// c = kC2 * in4 - kC1 * in12
// d = kC1 * in4 + kC2 * in12
const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
const int16x8_t D0 = vcombine_s16(a, b); // D0 = a | b
const int16x8_t D1 = vcombine_s16(d, c); // D1 = d | c
const int16x8_t E0 = vqaddq_s16(D0, D1); // a+d | b+c
const int16x8_t E_tmp = vqsubq_s16(D0, D1); // a-d | b-c
const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
Transpose8x2_NEON(E0, E1, rows);
}
static void ITransformOne_NEON(const uint8_t* ref,
const int16_t* in, uint8_t* dst) {
int16x8x2_t rows;
INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
TransformPass_NEON(&rows);
TransformPass_NEON(&rows);
Add4x4_NEON(rows.val[0], rows.val[1], ref, dst);
}
#else
static void ITransformOne_NEON(const uint8_t* ref,
const int16_t* in, uint8_t* dst) {
const int kBPS = BPS;
const int16_t kC1C2[] = { kC1, kC2, 0, 0 };
__asm__ volatile (
"vld1.16 {q1, q2}, [%[in]] \n"
"vld1.16 {d0}, [%[kC1C2]] \n"
// d2: in[0]
// d3: in[8]
// d4: in[4]
// d5: in[12]
"vswp d3, d4 \n"
// q8 = {in[4], in[12]} * kC1 * 2 >> 16
// q9 = {in[4], in[12]} * kC2 >> 16
"vqdmulh.s16 q8, q2, d0[0] \n"
"vqdmulh.s16 q9, q2, d0[1] \n"
// d22 = a = in[0] + in[8]
// d23 = b = in[0] - in[8]
"vqadd.s16 d22, d2, d3 \n"
"vqsub.s16 d23, d2, d3 \n"
// q8 = in[4]/[12] * kC1 >> 16
"vshr.s16 q8, q8, #1 \n"
// Add {in[4], in[12]} back after the multiplication.
"vqadd.s16 q8, q2, q8 \n"
// d20 = c = in[4]*kC2 - in[12]*kC1
// d21 = d = in[4]*kC1 + in[12]*kC2
"vqsub.s16 d20, d18, d17 \n"
"vqadd.s16 d21, d19, d16 \n"
// d2 = tmp[0] = a + d
// d3 = tmp[1] = b + c
// d4 = tmp[2] = b - c
// d5 = tmp[3] = a - d
"vqadd.s16 d2, d22, d21 \n"
"vqadd.s16 d3, d23, d20 \n"
"vqsub.s16 d4, d23, d20 \n"
"vqsub.s16 d5, d22, d21 \n"
"vzip.16 q1, q2 \n"
"vzip.16 q1, q2 \n"
"vswp d3, d4 \n"
// q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
// q9 = {tmp[4], tmp[12]} * kC2 >> 16
"vqdmulh.s16 q8, q2, d0[0] \n"
"vqdmulh.s16 q9, q2, d0[1] \n"
// d22 = a = tmp[0] + tmp[8]
// d23 = b = tmp[0] - tmp[8]
"vqadd.s16 d22, d2, d3 \n"
"vqsub.s16 d23, d2, d3 \n"
"vshr.s16 q8, q8, #1 \n"
"vqadd.s16 q8, q2, q8 \n"
// d20 = c = in[4]*kC2 - in[12]*kC1
// d21 = d = in[4]*kC1 + in[12]*kC2
"vqsub.s16 d20, d18, d17 \n"
"vqadd.s16 d21, d19, d16 \n"
// d2 = tmp[0] = a + d
// d3 = tmp[1] = b + c
// d4 = tmp[2] = b - c
// d5 = tmp[3] = a - d
"vqadd.s16 d2, d22, d21 \n"
"vqadd.s16 d3, d23, d20 \n"
"vqsub.s16 d4, d23, d20 \n"
"vqsub.s16 d5, d22, d21 \n"
"vld1.32 d6[0], [%[ref]], %[kBPS] \n"
"vld1.32 d6[1], [%[ref]], %[kBPS] \n"
"vld1.32 d7[0], [%[ref]], %[kBPS] \n"
"vld1.32 d7[1], [%[ref]], %[kBPS] \n"
"sub %[ref], %[ref], %[kBPS], lsl #2 \n"
// (val + 4) >> 3, via a rounding shift
"vrshr.s16 d2, d2, #3 \n"
"vrshr.s16 d3, d3, #3 \n"
"vrshr.s16 d4, d4, #3 \n"
"vrshr.s16 d5, d5, #3 \n"
"vzip.16 q1, q2 \n"
"vzip.16 q1, q2 \n"
// Must accumulate before saturating
"vmovl.u8 q8, d6 \n"
"vmovl.u8 q9, d7 \n"
"vqadd.s16 q1, q1, q8 \n"
"vqadd.s16 q2, q2, q9 \n"
"vqmovun.s16 d0, q1 \n"
"vqmovun.s16 d1, q2 \n"
"vst1.32 d0[0], [%[dst]], %[kBPS] \n"
"vst1.32 d0[1], [%[dst]], %[kBPS] \n"
"vst1.32 d1[0], [%[dst]], %[kBPS] \n"
"vst1.32 d1[1], [%[dst]] \n"
: [in] "+r"(in), [dst] "+r"(dst) // modified registers
: [kBPS] "r"(kBPS), [kC1C2] "r"(kC1C2), [ref] "r"(ref) // constants
: "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11" // clobbered
);
}
#endif // WEBP_USE_INTRINSICS
static void ITransform_NEON(const uint8_t* ref,
const int16_t* in, uint8_t* dst, int do_two) {
ITransformOne_NEON(ref, in, dst);
if (do_two) {
ITransformOne_NEON(ref + 4, in + 16, dst + 4);
}
}
// Load all 4x4 pixels into a single uint8x16_t variable.
static uint8x16_t Load4x4_NEON(const uint8_t* src) {
uint32x4_t out = vdupq_n_u32(0);
out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0);
out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1);
out = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), out, 2);
out = vld1q_lane_u32((const uint32_t*)(src + 3 * BPS), out, 3);
return vreinterpretq_u8_u32(out);
}
// Forward transform.
#if defined(WEBP_USE_INTRINSICS)
static WEBP_INLINE void Transpose4x4_S16_NEON(const int16x4_t A,
const int16x4_t B,
const int16x4_t C,
const int16x4_t D,
int16x8_t* const out01,
int16x8_t* const out32) {
const int16x4x2_t AB = vtrn_s16(A, B);
const int16x4x2_t CD = vtrn_s16(C, D);
const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[0]),
vreinterpret_s32_s16(CD.val[0]));
const int32x2x2_t tmp13 = vtrn_s32(vreinterpret_s32_s16(AB.val[1]),
vreinterpret_s32_s16(CD.val[1]));
*out01 = vreinterpretq_s16_s64(
vcombine_s64(vreinterpret_s64_s32(tmp02.val[0]),
vreinterpret_s64_s32(tmp13.val[0])));
*out32 = vreinterpretq_s16_s64(
vcombine_s64(vreinterpret_s64_s32(tmp13.val[1]),
vreinterpret_s64_s32(tmp02.val[1])));
}
static WEBP_INLINE int16x8_t DiffU8ToS16_NEON(const uint8x8_t a,
const uint8x8_t b) {
return vreinterpretq_s16_u16(vsubl_u8(a, b));
}
static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
int16_t* out) {
int16x8_t d0d1, d3d2; // working 4x4 int16 variables
{
const uint8x16_t S0 = Load4x4_NEON(src);
const uint8x16_t R0 = Load4x4_NEON(ref);
const int16x8_t D0D1 = DiffU8ToS16_NEON(vget_low_u8(S0), vget_low_u8(R0));
const int16x8_t D2D3 = DiffU8ToS16_NEON(vget_high_u8(S0), vget_high_u8(R0));
const int16x4_t D0 = vget_low_s16(D0D1);
const int16x4_t D1 = vget_high_s16(D0D1);
const int16x4_t D2 = vget_low_s16(D2D3);
const int16x4_t D3 = vget_high_s16(D2D3);
Transpose4x4_S16_NEON(D0, D1, D2, D3, &d0d1, &d3d2);
}
{ // 1st pass
const int32x4_t kCst937 = vdupq_n_s32(937);
const int32x4_t kCst1812 = vdupq_n_s32(1812);
const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2); // d0+d3 | d1+d2 (=a0|a1)
const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2); // d0-d3 | d1-d2 (=a3|a2)
const int16x8_t a0a1_2 = vshlq_n_s16(a0a1, 3);
const int16x4_t tmp0 = vadd_s16(vget_low_s16(a0a1_2),
vget_high_s16(a0a1_2));
const int16x4_t tmp2 = vsub_s16(vget_low_s16(a0a1_2),
vget_high_s16(a0a1_2));
const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
const int16x4_t tmp1 = vshrn_n_s32(vaddq_s32(a2_p_a3, kCst1812), 9);
const int16x4_t tmp3 = vshrn_n_s32(vaddq_s32(a3_m_a2, kCst937), 9);
Transpose4x4_S16_NEON(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
}
{ // 2nd pass
// the (1 << 16) addition implements the '+ (a3 != 0)' term, using the
// identity a3 != 0 <-> 1 - (a3 == 0): it adds 1 after the >> 16 narrowing,
// and the vceq mask below subtracts the (a3 == 0) part.
const int32x4_t kCst12000 = vdupq_n_s32(12000 + (1 << 16));
const int32x4_t kCst51000 = vdupq_n_s32(51000);
const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2); // d0+d3 | d1+d2 (=a0|a1)
const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2); // d0-d3 | d1-d2 (=a3|a2)
const int16x4_t a0_k7 = vadd_s16(vget_low_s16(a0a1), vdup_n_s16(7));
const int16x4_t out0 = vshr_n_s16(vadd_s16(a0_k7, vget_high_s16(a0a1)), 4);
const int16x4_t out2 = vshr_n_s16(vsub_s16(a0_k7, vget_high_s16(a0a1)), 4);
const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
const int16x4_t tmp1 = vaddhn_s32(a2_p_a3, kCst12000);
const int16x4_t out3 = vaddhn_s32(a3_m_a2, kCst51000);
const int16x4_t a3_eq_0 =
vreinterpret_s16_u16(vceq_s16(vget_low_s16(a3a2), vdup_n_s16(0)));
const int16x4_t out1 = vadd_s16(tmp1, a3_eq_0);
vst1_s16(out + 0, out0);
vst1_s16(out + 4, out1);
vst1_s16(out + 8, out2);
vst1_s16(out + 12, out3);
}
}
#else
// adapted from vp8/encoder/arm/neon/shortfdct_neon.asm
static const int16_t kCoeff16[] = {
5352, 5352, 5352, 5352, 2217, 2217, 2217, 2217
};
static const int32_t kCoeff32[] = {
1812, 1812, 1812, 1812,
937, 937, 937, 937,
12000, 12000, 12000, 12000,
51000, 51000, 51000, 51000
};
static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
int16_t* out) {
const int kBPS = BPS;
const uint8_t* src_ptr = src;
const uint8_t* ref_ptr = ref;
const int16_t* coeff16 = kCoeff16;
const int32_t* coeff32 = kCoeff32;
__asm__ volatile (
// load src into q4, q5 in high half
"vld1.8 {d8}, [%[src_ptr]], %[kBPS] \n"
"vld1.8 {d10}, [%[src_ptr]], %[kBPS] \n"
"vld1.8 {d9}, [%[src_ptr]], %[kBPS] \n"
"vld1.8 {d11}, [%[src_ptr]] \n"
// load ref into q6, q7 in high half
"vld1.8 {d12}, [%[ref_ptr]], %[kBPS] \n"
"vld1.8 {d14}, [%[ref_ptr]], %[kBPS] \n"
"vld1.8 {d13}, [%[ref_ptr]], %[kBPS] \n"
"vld1.8 {d15}, [%[ref_ptr]] \n"
// Pack the high values into q4 and q6
"vtrn.32 q4, q5 \n"
"vtrn.32 q6, q7 \n"
// d[0-3] = src - ref
"vsubl.u8 q0, d8, d12 \n"
"vsubl.u8 q1, d9, d13 \n"
// load coeff16 into q8(d16=5352, d17=2217)
"vld1.16 {q8}, [%[coeff16]] \n"
// load coeff32 first half into q9 = 1812, q10 = 937
"vld1.32 {q9, q10}, [%[coeff32]]! \n"
// load coeff32 second half into q11 = 12000, q12 = 51000
"vld1.32 {q11,q12}, [%[coeff32]] \n"
// part 1
// Transpose. Register dN is the same as dN in C
"vtrn.32 d0, d2 \n"
"vtrn.32 d1, d3 \n"
"vtrn.16 d0, d1 \n"
"vtrn.16 d2, d3 \n"
"vadd.s16 d4, d0, d3 \n" // a0 = d0 + d3
"vadd.s16 d5, d1, d2 \n" // a1 = d1 + d2
"vsub.s16 d6, d1, d2 \n" // a2 = d1 - d2
"vsub.s16 d7, d0, d3 \n" // a3 = d0 - d3
"vadd.s16 d0, d4, d5 \n" // a0 + a1
"vshl.s16 d0, d0, #3 \n" // temp[0+i*4] = (a0+a1) << 3
"vsub.s16 d2, d4, d5 \n" // a0 - a1
"vshl.s16 d2, d2, #3 \n" // (temp[2+i*4] = (a0-a1) << 3
"vmlal.s16 q9, d7, d16 \n" // a3*5352 + 1812
"vmlal.s16 q10, d7, d17 \n" // a3*2217 + 937
"vmlal.s16 q9, d6, d17 \n" // a2*2217 + a3*5352 + 1812
"vmlsl.s16 q10, d6, d16 \n" // a3*2217 + 937 - a2*5352
// temp[1+i*4] = (d2*2217 + d3*5352 + 1812) >> 9
// temp[3+i*4] = (d3*2217 + 937 - d2*5352) >> 9
"vshrn.s32 d1, q9, #9 \n"
"vshrn.s32 d3, q10, #9 \n"
// part 2
// transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
"vtrn.32 d0, d2 \n"
"vtrn.32 d1, d3 \n"
"vtrn.16 d0, d1 \n"
"vtrn.16 d2, d3 \n"
"vmov.s16 d26, #7 \n"
"vadd.s16 d4, d0, d3 \n" // a1 = ip[0] + ip[12]
"vadd.s16 d5, d1, d2 \n" // b1 = ip[4] + ip[8]
"vsub.s16 d6, d1, d2 \n" // c1 = ip[4] - ip[8]
"vadd.s16 d4, d4, d26 \n" // a1 + 7
"vsub.s16 d7, d0, d3 \n" // d1 = ip[0] - ip[12]
"vadd.s16 d0, d4, d5 \n" // op[0] = a1 + b1 + 7
"vsub.s16 d2, d4, d5 \n" // op[8] = a1 - b1 + 7
"vmlal.s16 q11, d7, d16 \n" // d1*5352 + 12000
"vmlal.s16 q12, d7, d17 \n" // d1*2217 + 51000
"vceq.s16 d4, d7, #0 \n"
"vshr.s16 d0, d0, #4 \n"
"vshr.s16 d2, d2, #4 \n"
"vmlal.s16 q11, d6, d17 \n" // c1*2217 + d1*5352 + 12000
"vmlsl.s16 q12, d6, d16 \n" // d1*2217 - c1*5352 + 51000
"vmvn d4, d4 \n" // !(d1 == 0)
// op[4] = (c1*2217 + d1*5352 + 12000)>>16
"vshrn.s32 d1, q11, #16 \n"
// op[4] += (d1!=0)
"vsub.s16 d1, d1, d4 \n"
// op[12]= (d1*2217 - c1*5352 + 51000)>>16
"vshrn.s32 d3, q12, #16 \n"
// set result to out array
"vst1.16 {q0, q1}, [%[out]] \n"
: [src_ptr] "+r"(src_ptr), [ref_ptr] "+r"(ref_ptr),
[coeff32] "+r"(coeff32) // modified registers
: [kBPS] "r"(kBPS), [coeff16] "r"(coeff16),
[out] "r"(out) // constants
: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
"q10", "q11", "q12", "q13" // clobbered
);
}
#endif
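// For reference, the scalar arithmetic that both variants above implement,
// modelled on the generic C version (an illustrative sketch; the name is
// hypothetical and this reference copy is not wired into the dispatch below):
static WEBP_INLINE void FTransformRef_C(const uint8_t* src, const uint8_t* ref,
                                        int16_t* out) {
  int i;
  int tmp[16];
  for (i = 0; i < 4; ++i, src += BPS, ref += BPS) {
    const int d0 = src[0] - ref[0];   // 9-bit dynamic range ([-255,255])
    const int d1 = src[1] - ref[1];
    const int d2 = src[2] - ref[2];
    const int d3 = src[3] - ref[3];
    const int a0 = d0 + d3;           // 10b
    const int a1 = d1 + d2;
    const int a2 = d1 - d2;
    const int a3 = d0 - d3;
    tmp[0 + i * 4] = (a0 + a1) * 8;   // 14b
    tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 1812) >> 9;
    tmp[2 + i * 4] = (a0 - a1) * 8;
    tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 +  937) >> 9;
  }
  for (i = 0; i < 4; ++i) {
    const int a0 = tmp[0 + i] + tmp[12 + i];
    const int a1 = tmp[4 + i] + tmp[ 8 + i];
    const int a2 = tmp[4 + i] - tmp[ 8 + i];
    const int a3 = tmp[0 + i] - tmp[12 + i];
    out[0 + i] = (a0 + a1 + 7) >> 4;
    out[4 + i] = ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0);
    out[8 + i] = (a0 - a1 + 7) >> 4;
    out[12 + i] = (a3 * 2217 - a2 * 5352 + 51000) >> 16;
  }
}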
#define LOAD_LANE_16b(VALUE, LANE) do { \
(VALUE) = vld1_lane_s16(src, (VALUE), (LANE)); \
src += stride; \
} while (0)
static void FTransformWHT_NEON(const int16_t* src, int16_t* out) {
const int stride = 16;
const int16x4_t zero = vdup_n_s16(0);
int32x4x4_t tmp0;
int16x4x4_t in;
INIT_VECTOR4(in, zero, zero, zero, zero);
LOAD_LANE_16b(in.val[0], 0);
LOAD_LANE_16b(in.val[1], 0);
LOAD_LANE_16b(in.val[2], 0);
LOAD_LANE_16b(in.val[3], 0);
LOAD_LANE_16b(in.val[0], 1);
LOAD_LANE_16b(in.val[1], 1);
LOAD_LANE_16b(in.val[2], 1);
LOAD_LANE_16b(in.val[3], 1);
LOAD_LANE_16b(in.val[0], 2);
LOAD_LANE_16b(in.val[1], 2);
LOAD_LANE_16b(in.val[2], 2);
LOAD_LANE_16b(in.val[3], 2);
LOAD_LANE_16b(in.val[0], 3);
LOAD_LANE_16b(in.val[1], 3);
LOAD_LANE_16b(in.val[2], 3);
LOAD_LANE_16b(in.val[3], 3);
{
// a0 = in[0 * 16] + in[2 * 16]
// a1 = in[1 * 16] + in[3 * 16]
// a2 = in[1 * 16] - in[3 * 16]
// a3 = in[0 * 16] - in[2 * 16]
const int32x4_t a0 = vaddl_s16(in.val[0], in.val[2]);
const int32x4_t a1 = vaddl_s16(in.val[1], in.val[3]);
const int32x4_t a2 = vsubl_s16(in.val[1], in.val[3]);
const int32x4_t a3 = vsubl_s16(in.val[0], in.val[2]);
tmp0.val[0] = vaddq_s32(a0, a1);
tmp0.val[1] = vaddq_s32(a3, a2);
tmp0.val[2] = vsubq_s32(a3, a2);
tmp0.val[3] = vsubq_s32(a0, a1);
}
{
const int32x4x4_t tmp1 = Transpose4x4_NEON(tmp0);
// a0 = tmp[0 + i] + tmp[ 8 + i]
// a1 = tmp[4 + i] + tmp[12 + i]
// a2 = tmp[4 + i] - tmp[12 + i]
// a3 = tmp[0 + i] - tmp[ 8 + i]
const int32x4_t a0 = vaddq_s32(tmp1.val[0], tmp1.val[2]);
const int32x4_t a1 = vaddq_s32(tmp1.val[1], tmp1.val[3]);
const int32x4_t a2 = vsubq_s32(tmp1.val[1], tmp1.val[3]);
const int32x4_t a3 = vsubq_s32(tmp1.val[0], tmp1.val[2]);
const int32x4_t b0 = vhaddq_s32(a0, a1); // (a0 + a1) >> 1
const int32x4_t b1 = vhaddq_s32(a3, a2); // (a3 + a2) >> 1
const int32x4_t b2 = vhsubq_s32(a3, a2); // (a3 - a2) >> 1
const int32x4_t b3 = vhsubq_s32(a0, a1); // (a0 - a1) >> 1
const int16x4_t out0 = vmovn_s32(b0);
const int16x4_t out1 = vmovn_s32(b1);
const int16x4_t out2 = vmovn_s32(b2);
const int16x4_t out3 = vmovn_s32(b3);
vst1_s16(out + 0, out0);
vst1_s16(out + 4, out1);
vst1_s16(out + 8, out2);
vst1_s16(out + 12, out3);
}
}
#undef LOAD_LANE_16b
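// For reference, the scalar arithmetic of the Walsh-Hadamard transform above,
// modelled on the generic C version (illustrative sketch only; the halving
// adds/subs of the second pass map to the vhaddq/vhsubq pairs):
//   'in' is read with a stride of 16, i.e. one DC coefficient per 4x4 block:
//     for (i = 0; i < 4; ++i, in += 64) {
//       a0 = in[0 * 16] + in[2 * 16];
//       a1 = in[1 * 16] + in[3 * 16];
//       a2 = in[1 * 16] - in[3 * 16];
//       a3 = in[0 * 16] - in[2 * 16];
//       tmp[0 + i * 4] = a0 + a1;
//       tmp[1 + i * 4] = a3 + a2;
//       tmp[2 + i * 4] = a3 - a2;
//       tmp[3 + i * 4] = a0 - a1;
//     }
//   then the same butterfly runs down the columns of tmp[] and the results
//   are halved: out[0 + i] = (a0 + a1) >> 1, ..., out[12 + i] = (a0 - a1) >> 1.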
//------------------------------------------------------------------------------
// Texture distortion
//
// We try to match the spectral content (weighted) between source and
// reconstructed samples.
// a 0123, b 0123
// a 4567, b 4567
// a 89ab, b 89ab
// a cdef, b cdef
//
// transpose
//
// a 048c, b 048c
// a 159d, b 159d
// a 26ae, b 26ae
// a 37bf, b 37bf
//
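// In scalar terms, the routines below compute (illustrative sketch):
//   sum1 = sum_i w[i] * |Hadamard4x4(a)[i]|
//   sum2 = sum_i w[i] * |Hadamard4x4(b)[i]|
//   Disto4x4 = |sum1 - sum2| >> 5
// with both 4x4 Hadamard transforms evaluated in parallel on the interleaved
// a/b registers.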
static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16_NEON(int16x8x4_t q4_in) {
const int16x8x2_t q2_tmp0 = vtrnq_s16(q4_in.val[0], q4_in.val[1]);
const int16x8x2_t q2_tmp1 = vtrnq_s16(q4_in.val[2], q4_in.val[3]);
const int32x4x2_t q2_tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[0]),
vreinterpretq_s32_s16(q2_tmp1.val[0]));
const int32x4x2_t q2_tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[1]),
vreinterpretq_s32_s16(q2_tmp1.val[1]));
q4_in.val[0] = vreinterpretq_s16_s32(q2_tmp2.val[0]);
q4_in.val[2] = vreinterpretq_s16_s32(q2_tmp2.val[1]);
q4_in.val[1] = vreinterpretq_s16_s32(q2_tmp3.val[0]);
q4_in.val[3] = vreinterpretq_s16_s32(q2_tmp3.val[1]);
return q4_in;
}
static WEBP_INLINE int16x8x4_t DistoHorizontalPass_NEON(
const int16x8x4_t q4_in) {
// {a0, a1} = {in[0] + in[2], in[1] + in[3]}
// {a3, a2} = {in[0] - in[2], in[1] - in[3]}
const int16x8_t q_a0 = vaddq_s16(q4_in.val[0], q4_in.val[2]);
const int16x8_t q_a1 = vaddq_s16(q4_in.val[1], q4_in.val[3]);
const int16x8_t q_a3 = vsubq_s16(q4_in.val[0], q4_in.val[2]);
const int16x8_t q_a2 = vsubq_s16(q4_in.val[1], q4_in.val[3]);
int16x8x4_t q4_out;
// tmp[0] = a0 + a1
// tmp[1] = a3 + a2
// tmp[2] = a3 - a2
// tmp[3] = a0 - a1
INIT_VECTOR4(q4_out,
vabsq_s16(vaddq_s16(q_a0, q_a1)),
vabsq_s16(vaddq_s16(q_a3, q_a2)),
vabdq_s16(q_a3, q_a2), vabdq_s16(q_a0, q_a1));
return q4_out;
}
static WEBP_INLINE int16x8x4_t DistoVerticalPass_NEON(const uint8x8x4_t q4_in) {
const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[0],
q4_in.val[2]));
const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[1],
q4_in.val[3]));
const int16x8_t q_a2 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[1],
q4_in.val[3]));
const int16x8_t q_a3 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[0],
q4_in.val[2]));
int16x8x4_t q4_out;
INIT_VECTOR4(q4_out,
vaddq_s16(q_a0, q_a1), vaddq_s16(q_a3, q_a2),
vsubq_s16(q_a3, q_a2), vsubq_s16(q_a0, q_a1));
return q4_out;
}
static WEBP_INLINE int16x4x4_t DistoLoadW_NEON(const uint16_t* w) {
const uint16x8_t q_w07 = vld1q_u16(&w[0]);
const uint16x8_t q_w8f = vld1q_u16(&w[8]);
int16x4x4_t d4_w;
INIT_VECTOR4(d4_w,
vget_low_s16(vreinterpretq_s16_u16(q_w07)),
vget_high_s16(vreinterpretq_s16_u16(q_w07)),
vget_low_s16(vreinterpretq_s16_u16(q_w8f)),
vget_high_s16(vreinterpretq_s16_u16(q_w8f)));
return d4_w;
}
static WEBP_INLINE int32x2_t DistoSum_NEON(const int16x8x4_t q4_in,
const int16x4x4_t d4_w) {
int32x2_t d_sum;
// sum += w[ 0] * abs(b0);
// sum += w[ 4] * abs(b1);
// sum += w[ 8] * abs(b2);
// sum += w[12] * abs(b3);
int32x4_t q_sum0 = vmull_s16(d4_w.val[0], vget_low_s16(q4_in.val[0]));
int32x4_t q_sum1 = vmull_s16(d4_w.val[1], vget_low_s16(q4_in.val[1]));
int32x4_t q_sum2 = vmull_s16(d4_w.val[2], vget_low_s16(q4_in.val[2]));
int32x4_t q_sum3 = vmull_s16(d4_w.val[3], vget_low_s16(q4_in.val[3]));
q_sum0 = vmlsl_s16(q_sum0, d4_w.val[0], vget_high_s16(q4_in.val[0]));
q_sum1 = vmlsl_s16(q_sum1, d4_w.val[1], vget_high_s16(q4_in.val[1]));
q_sum2 = vmlsl_s16(q_sum2, d4_w.val[2], vget_high_s16(q4_in.val[2]));
q_sum3 = vmlsl_s16(q_sum3, d4_w.val[3], vget_high_s16(q4_in.val[3]));
q_sum0 = vaddq_s32(q_sum0, q_sum1);
q_sum2 = vaddq_s32(q_sum2, q_sum3);
q_sum2 = vaddq_s32(q_sum0, q_sum2);
d_sum = vpadd_s32(vget_low_s32(q_sum2), vget_high_s32(q_sum2));
d_sum = vpadd_s32(d_sum, d_sum);
return d_sum;
}
#define LOAD_LANE_32b(src, VALUE, LANE) \
(VALUE) = vld1_lane_u32((const uint32_t*)(src), (VALUE), (LANE))
// Hadamard transform
// Returns the weighted sum of the absolute value of transformed coefficients.
// w[] contains a row-major 4 by 4 symmetric matrix.
static int Disto4x4_NEON(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
uint32x2_t d_in_ab_0123 = vdup_n_u32(0);
uint32x2_t d_in_ab_4567 = vdup_n_u32(0);
uint32x2_t d_in_ab_89ab = vdup_n_u32(0);
uint32x2_t d_in_ab_cdef = vdup_n_u32(0);
uint8x8x4_t d4_in;
// load data a, b
LOAD_LANE_32b(a + 0 * BPS, d_in_ab_0123, 0);
LOAD_LANE_32b(a + 1 * BPS, d_in_ab_4567, 0);
LOAD_LANE_32b(a + 2 * BPS, d_in_ab_89ab, 0);
LOAD_LANE_32b(a + 3 * BPS, d_in_ab_cdef, 0);
LOAD_LANE_32b(b + 0 * BPS, d_in_ab_0123, 1);
LOAD_LANE_32b(b + 1 * BPS, d_in_ab_4567, 1);
LOAD_LANE_32b(b + 2 * BPS, d_in_ab_89ab, 1);
LOAD_LANE_32b(b + 3 * BPS, d_in_ab_cdef, 1);
INIT_VECTOR4(d4_in,
vreinterpret_u8_u32(d_in_ab_0123),
vreinterpret_u8_u32(d_in_ab_4567),
vreinterpret_u8_u32(d_in_ab_89ab),
vreinterpret_u8_u32(d_in_ab_cdef));
{
// Vertical pass first to avoid a transpose (vertical and horizontal passes
// are commutative because w/kWeightY is symmetric) and subsequent
// transpose.
const int16x8x4_t q4_v = DistoVerticalPass_NEON(d4_in);
const int16x4x4_t d4_w = DistoLoadW_NEON(w);
// horizontal pass
const int16x8x4_t q4_t = DistoTranspose4x4S16_NEON(q4_v);
const int16x8x4_t q4_h = DistoHorizontalPass_NEON(q4_t);
int32x2_t d_sum = DistoSum_NEON(q4_h, d4_w);
// abs(sum2 - sum1) >> 5
d_sum = vabs_s32(d_sum);
d_sum = vshr_n_s32(d_sum, 5);
return vget_lane_s32(d_sum, 0);
}
}
#undef LOAD_LANE_32b
static int Disto16x16_NEON(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
int D = 0;
int x, y;
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
for (x = 0; x < 16; x += 4) {
D += Disto4x4_NEON(a + x + y, b + x + y, w);
}
}
return D;
}
//------------------------------------------------------------------------------
static void CollectHistogram_NEON(const uint8_t* ref, const uint8_t* pred,
int start_block, int end_block,
VP8Histogram* const histo) {
const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH);
int j;
int distribution[MAX_COEFF_THRESH + 1] = { 0 };
for (j = start_block; j < end_block; ++j) {
int16_t out[16];
FTransform_NEON(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
{
int k;
const int16x8_t a0 = vld1q_s16(out + 0);
const int16x8_t b0 = vld1q_s16(out + 8);
const uint16x8_t a1 = vreinterpretq_u16_s16(vabsq_s16(a0));
const uint16x8_t b1 = vreinterpretq_u16_s16(vabsq_s16(b0));
const uint16x8_t a2 = vshrq_n_u16(a1, 3);
const uint16x8_t b2 = vshrq_n_u16(b1, 3);
const uint16x8_t a3 = vminq_u16(a2, max_coeff_thresh);
const uint16x8_t b3 = vminq_u16(b2, max_coeff_thresh);
vst1q_s16(out + 0, vreinterpretq_s16_u16(a3));
vst1q_s16(out + 8, vreinterpretq_s16_u16(b3));
// Convert coefficients to bin.
for (k = 0; k < 16; ++k) {
++distribution[out[k]];
}
}
}
VP8SetHistogramData(distribution, histo);
}
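// Per coefficient, the SIMD block above is equivalent to the scalar binning
// (illustrative):
//   const int v = abs(out[k]) >> 3;
//   const int bin = (v > MAX_COEFF_THRESH) ? MAX_COEFF_THRESH : v;
//   ++distribution[bin];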
//------------------------------------------------------------------------------
static WEBP_INLINE void AccumulateSSE16_NEON(const uint8_t* const a,
const uint8_t* const b,
uint32x4_t* const sum) {
const uint8x16_t a0 = vld1q_u8(a);
const uint8x16_t b0 = vld1q_u8(b);
const uint8x16_t abs_diff = vabdq_u8(a0, b0);
const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
vget_low_u8(abs_diff));
const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),
vget_high_u8(abs_diff));
/* pair-wise adds and widen */
const uint32x4_t sum1 = vpaddlq_u16(prod1);
const uint32x4_t sum2 = vpaddlq_u16(prod2);
*sum = vaddq_u32(*sum, vaddq_u32(sum1, sum2));
}
// Horizontal sum of all four uint32_t values in 'sum'.
static int SumToInt_NEON(uint32x4_t sum) {
const uint64x2_t sum2 = vpaddlq_u32(sum);
const uint64_t sum3 = vgetq_lane_u64(sum2, 0) + vgetq_lane_u64(sum2, 1);
return (int)sum3;
}
static int SSE16x16_NEON(const uint8_t* a, const uint8_t* b) {
uint32x4_t sum = vdupq_n_u32(0);
int y;
for (y = 0; y < 16; ++y) {
AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
}
return SumToInt_NEON(sum);
}
static int SSE16x8_NEON(const uint8_t* a, const uint8_t* b) {
uint32x4_t sum = vdupq_n_u32(0);
int y;
for (y = 0; y < 8; ++y) {
AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
}
return SumToInt_NEON(sum);
}
static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) {
uint32x4_t sum = vdupq_n_u32(0);
int y;
for (y = 0; y < 8; ++y) {
const uint8x8_t a0 = vld1_u8(a + y * BPS);
const uint8x8_t b0 = vld1_u8(b + y * BPS);
const uint8x8_t abs_diff = vabd_u8(a0, b0);
const uint16x8_t prod = vmull_u8(abs_diff, abs_diff);
sum = vpadalq_u16(sum, prod);
}
return SumToInt_NEON(sum);
}
static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
const uint8x16_t a0 = Load4x4_NEON(a);
const uint8x16_t b0 = Load4x4_NEON(b);
const uint8x16_t abs_diff = vabdq_u8(a0, b0);
const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
vget_low_u8(abs_diff));
const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),
vget_high_u8(abs_diff));
/* pair-wise adds and widen */
const uint32x4_t sum1 = vpaddlq_u16(prod1);
const uint32x4_t sum2 = vpaddlq_u16(prod2);
return SumToInt_NEON(vaddq_u32(sum1, sum2));
}
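// All four SSE*_NEON variants above compute a plain sum of squared
// differences over a WxH block; a scalar sketch for reference (the name is
// illustrative and this helper is not referenced below):
static WEBP_INLINE int SSE_WxH_Scalar(const uint8_t* a, const uint8_t* b,
                                      int w, int h) {
  int count = 0;
  int x, y;
  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x) {
      const int diff = (int)a[x] - b[x];
      count += diff * diff;
    }
    a += BPS;
    b += BPS;
  }
  return count;
}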
//------------------------------------------------------------------------------
// Compilation with gcc-4.6.x is problematic for now.
#if !defined(WORK_AROUND_GCC)
static int16x8_t Quantize_NEON(int16_t* const in,
const VP8Matrix* const mtx, int offset) {
const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);
const uint16x8_t q = vld1q_u16(&mtx->q_[offset]);
const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]);
const uint32x4_t bias0 = vld1q_u32(&mtx->bias_[offset + 0]);
const uint32x4_t bias1 = vld1q_u32(&mtx->bias_[offset + 4]);
const int16x8_t a = vld1q_s16(in + offset); // in
const uint16x8_t b = vreinterpretq_u16_s16(vabsq_s16(a)); // coeff = abs(in)
const int16x8_t sign = vshrq_n_s16(a, 15); // sign
const uint16x8_t c = vaddq_u16(b, sharp); // + sharpen
const uint32x4_t m0 = vmull_u16(vget_low_u16(c), vget_low_u16(iq));
const uint32x4_t m1 = vmull_u16(vget_high_u16(c), vget_high_u16(iq));
const uint32x4_t m2 = vhaddq_u32(m0, bias0);
const uint32x4_t m3 = vhaddq_u32(m1, bias1); // (coeff * iQ + bias) >> 1
const uint16x8_t c0 = vcombine_u16(vshrn_n_u32(m2, 16),
vshrn_n_u32(m3, 16)); // QFIX=17 = 16+1
const uint16x8_t c1 = vminq_u16(c0, vdupq_n_u16(MAX_LEVEL));
const int16x8_t c2 = veorq_s16(vreinterpretq_s16_u16(c1), sign);
const int16x8_t c3 = vsubq_s16(c2, sign); // restore sign
const int16x8_t c4 = vmulq_s16(c3, vreinterpretq_s16_u16(q));
vst1q_s16(in + offset, c4);
assert(QFIX == 17); // this function can't work as is if QFIX != 16+1
return c3;
}
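// Per coefficient 'n', the vector code above mirrors this scalar quantization
// (illustrative sketch; QFIX == 17, MAX_LEVEL == 2047; the vhaddq plus the
// narrowing >> 16 together realize the >> QFIX):
//   sign  = (in[n] < 0);
//   coeff = abs(in[n]) + mtx->sharpen_[n];
//   level = (coeff * mtx->iq_[n] + mtx->bias_[n]) >> QFIX;
//   if (level > MAX_LEVEL) level = MAX_LEVEL;
//   if (sign) level = -level;
//   in[n] = level * mtx->q_[n];   // dequantized value, written back
//   return level;                 // signed level, zigzag-stored by the caller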
static const uint8_t kShuffles[4][8] = {
{ 0, 1, 2, 3, 8, 9, 16, 17 },
{ 10, 11, 4, 5, 6, 7, 12, 13 },
{ 18, 19, 24, 25, 26, 27, 20, 21 },
{ 14, 15, 22, 23, 28, 29, 30, 31 }
};
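// Together these four 8-byte shuffles implement VP8's zigzag order
// {0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15} on the 16-bit
// coefficients: byte k of 16-bit word w lives at table index 2 * w + k,
// with out0 covering words 0-7 and out1 covering words 8-15.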
static int QuantizeBlock_NEON(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
const int16x8_t out0 = Quantize_NEON(in, mtx, 0);
const int16x8_t out1 = Quantize_NEON(in, mtx, 8);
uint8x8x4_t shuffles;
// vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
// non-standard versions there.
#if defined(__APPLE__) && defined(__aarch64__) && \
defined(__apple_build_version__) && (__apple_build_version__ < 6020037)
uint8x16x2_t all_out;
INIT_VECTOR2(all_out, vreinterpretq_u8_s16(out0), vreinterpretq_u8_s16(out1));
INIT_VECTOR4(shuffles,
vtbl2q_u8(all_out, vld1_u8(kShuffles[0])),
vtbl2q_u8(all_out, vld1_u8(kShuffles[1])),
vtbl2q_u8(all_out, vld1_u8(kShuffles[2])),
vtbl2q_u8(all_out, vld1_u8(kShuffles[3])));
#else
uint8x8x4_t all_out;
INIT_VECTOR4(all_out,
vreinterpret_u8_s16(vget_low_s16(out0)),
vreinterpret_u8_s16(vget_high_s16(out0)),
vreinterpret_u8_s16(vget_low_s16(out1)),
vreinterpret_u8_s16(vget_high_s16(out1)));
INIT_VECTOR4(shuffles,
vtbl4_u8(all_out, vld1_u8(kShuffles[0])),
vtbl4_u8(all_out, vld1_u8(kShuffles[1])),
vtbl4_u8(all_out, vld1_u8(kShuffles[2])),
vtbl4_u8(all_out, vld1_u8(kShuffles[3])));
#endif
// Zigzag reordering
vst1_u8((uint8_t*)(out + 0), shuffles.val[0]);
vst1_u8((uint8_t*)(out + 4), shuffles.val[1]);
vst1_u8((uint8_t*)(out + 8), shuffles.val[2]);
vst1_u8((uint8_t*)(out + 12), shuffles.val[3]);
// test zeros
if (*(uint64_t*)(out + 0) != 0) return 1;
if (*(uint64_t*)(out + 4) != 0) return 1;
if (*(uint64_t*)(out + 8) != 0) return 1;
if (*(uint64_t*)(out + 12) != 0) return 1;
return 0;
}
static int Quantize2Blocks_NEON(int16_t in[32], int16_t out[32],
const VP8Matrix* const mtx) {
int nz;
nz = QuantizeBlock_NEON(in + 0 * 16, out + 0 * 16, mtx) << 0;
nz |= QuantizeBlock_NEON(in + 1 * 16, out + 1 * 16, mtx) << 1;
return nz;
}
#endif // !WORK_AROUND_GCC
//------------------------------------------------------------------------------
// Entry point
extern void VP8EncDspInitNEON(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
VP8ITransform = ITransform_NEON;
VP8FTransform = FTransform_NEON;
VP8FTransformWHT = FTransformWHT_NEON;
VP8TDisto4x4 = Disto4x4_NEON;
VP8TDisto16x16 = Disto16x16_NEON;
VP8CollectHistogram = CollectHistogram_NEON;
VP8SSE16x16 = SSE16x16_NEON;
VP8SSE16x8 = SSE16x8_NEON;
VP8SSE8x8 = SSE8x8_NEON;
VP8SSE4x4 = SSE4x4_NEON;
#if !defined(WORK_AROUND_GCC)
VP8EncQuantizeBlock = QuantizeBlock_NEON;
VP8EncQuantize2Blocks = Quantize2Blocks_NEON;
#endif
}
#else // !WEBP_USE_NEON
WEBP_DSP_INIT_STUB(VP8EncDspInitNEON)
#endif // WEBP_USE_NEON

File diff suppressed because it is too large

View File

@ -0,0 +1,339 @@
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE4 version of some encoding functions.
//
// Author: Skal (pascal.massimino@gmail.com)
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_SSE41)
#include <smmintrin.h>
#include <stdlib.h> // for abs()
#include "src/dsp/common_sse2.h"
#include "src/enc/vp8i_enc.h"
//------------------------------------------------------------------------------
// Compute susceptibility based on DCT-coeff histograms.
static void CollectHistogram_SSE41(const uint8_t* ref, const uint8_t* pred,
int start_block, int end_block,
VP8Histogram* const histo) {
const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
int j;
int distribution[MAX_COEFF_THRESH + 1] = { 0 };
for (j = start_block; j < end_block; ++j) {
int16_t out[16];
int k;
VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
// Convert coefficients to bin (within out[]).
{
// Load.
const __m128i out0 = _mm_loadu_si128((__m128i*)&out[0]);
const __m128i out1 = _mm_loadu_si128((__m128i*)&out[8]);
// v = abs(out) >> 3
const __m128i abs0 = _mm_abs_epi16(out0);
const __m128i abs1 = _mm_abs_epi16(out1);
const __m128i v0 = _mm_srai_epi16(abs0, 3);
const __m128i v1 = _mm_srai_epi16(abs1, 3);
// bin = min(v, MAX_COEFF_THRESH)
const __m128i bin0 = _mm_min_epi16(v0, max_coeff_thresh);
const __m128i bin1 = _mm_min_epi16(v1, max_coeff_thresh);
// Store.
_mm_storeu_si128((__m128i*)&out[0], bin0);
_mm_storeu_si128((__m128i*)&out[8], bin1);
}
// Convert coefficients to bin.
for (k = 0; k < 16; ++k) {
++distribution[out[k]];
}
}
VP8SetHistogramData(distribution, histo);
}
//------------------------------------------------------------------------------
// Texture distortion
//
// We try to match the spectral content (weighted) between source and
// reconstructed samples.
// Hadamard transform
// Returns the weighted sum of the absolute value of transformed coefficients.
// w[] contains a row-major 4 by 4 symmetric matrix.
static int TTransform_SSE41(const uint8_t* inA, const uint8_t* inB,
const uint16_t* const w) {
int32_t sum[4];
__m128i tmp_0, tmp_1, tmp_2, tmp_3;
// Load and combine inputs.
{
const __m128i inA_0 = _mm_loadu_si128((const __m128i*)&inA[BPS * 0]);
const __m128i inA_1 = _mm_loadu_si128((const __m128i*)&inA[BPS * 1]);
const __m128i inA_2 = _mm_loadu_si128((const __m128i*)&inA[BPS * 2]);
// In SSE4.1, with gcc 4.8 at least (maybe other versions),
// _mm_loadu_si128 is faster than _mm_loadl_epi64. But for the last row
// of inA and inB, _mm_loadl_epi64 is still used to avoid an
// out-of-bounds read.
const __m128i inA_3 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 3]);
const __m128i inB_0 = _mm_loadu_si128((const __m128i*)&inB[BPS * 0]);
const __m128i inB_1 = _mm_loadu_si128((const __m128i*)&inB[BPS * 1]);
const __m128i inB_2 = _mm_loadu_si128((const __m128i*)&inB[BPS * 2]);
const __m128i inB_3 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 3]);
// Combine inA and inB (we'll do two transforms in parallel).
const __m128i inAB_0 = _mm_unpacklo_epi32(inA_0, inB_0);
const __m128i inAB_1 = _mm_unpacklo_epi32(inA_1, inB_1);
const __m128i inAB_2 = _mm_unpacklo_epi32(inA_2, inB_2);
const __m128i inAB_3 = _mm_unpacklo_epi32(inA_3, inB_3);
tmp_0 = _mm_cvtepu8_epi16(inAB_0);
tmp_1 = _mm_cvtepu8_epi16(inAB_1);
tmp_2 = _mm_cvtepu8_epi16(inAB_2);
tmp_3 = _mm_cvtepu8_epi16(inAB_3);
// a00 a01 a02 a03 b00 b01 b02 b03
// a10 a11 a12 a13 b10 b11 b12 b13
// a20 a21 a22 a23 b20 b21 b22 b23
// a30 a31 a32 a33 b30 b31 b32 b33
}
// Vertical pass first to avoid a transpose (vertical and horizontal passes
// are commutative because w/kWeightY is symmetric) and subsequent transpose.
{
// Calculate a and b (two 4x4 at once).
const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
const __m128i b0 = _mm_add_epi16(a0, a1);
const __m128i b1 = _mm_add_epi16(a3, a2);
const __m128i b2 = _mm_sub_epi16(a3, a2);
const __m128i b3 = _mm_sub_epi16(a0, a1);
// a00 a01 a02 a03 b00 b01 b02 b03
// a10 a11 a12 a13 b10 b11 b12 b13
// a20 a21 a22 a23 b20 b21 b22 b23
// a30 a31 a32 a33 b30 b31 b32 b33
// Transpose the two 4x4.
VP8Transpose_2_4x4_16b(&b0, &b1, &b2, &b3, &tmp_0, &tmp_1, &tmp_2, &tmp_3);
}
// Horizontal pass and difference of weighted sums.
{
// Load all inputs.
const __m128i w_0 = _mm_loadu_si128((const __m128i*)&w[0]);
const __m128i w_8 = _mm_loadu_si128((const __m128i*)&w[8]);
// Calculate a and b (two 4x4 at once).
const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
const __m128i b0 = _mm_add_epi16(a0, a1);
const __m128i b1 = _mm_add_epi16(a3, a2);
const __m128i b2 = _mm_sub_epi16(a3, a2);
const __m128i b3 = _mm_sub_epi16(a0, a1);
// Separate the transforms of inA and inB.
__m128i A_b0 = _mm_unpacklo_epi64(b0, b1);
__m128i A_b2 = _mm_unpacklo_epi64(b2, b3);
__m128i B_b0 = _mm_unpackhi_epi64(b0, b1);
__m128i B_b2 = _mm_unpackhi_epi64(b2, b3);
A_b0 = _mm_abs_epi16(A_b0);
A_b2 = _mm_abs_epi16(A_b2);
B_b0 = _mm_abs_epi16(B_b0);
B_b2 = _mm_abs_epi16(B_b2);
// weighted sums
A_b0 = _mm_madd_epi16(A_b0, w_0);
A_b2 = _mm_madd_epi16(A_b2, w_8);
B_b0 = _mm_madd_epi16(B_b0, w_0);
B_b2 = _mm_madd_epi16(B_b2, w_8);
A_b0 = _mm_add_epi32(A_b0, A_b2);
B_b0 = _mm_add_epi32(B_b0, B_b2);
// difference of weighted sums
A_b2 = _mm_sub_epi32(A_b0, B_b0);
_mm_storeu_si128((__m128i*)&sum[0], A_b2);
}
return sum[0] + sum[1] + sum[2] + sum[3];
}
static int Disto4x4_SSE41(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
const int diff_sum = TTransform_SSE41(a, b, w);
return abs(diff_sum) >> 5;
}
static int Disto16x16_SSE41(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
int D = 0;
int x, y;
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
for (x = 0; x < 16; x += 4) {
D += Disto4x4_SSE41(a + x + y, b + x + y, w);
}
}
return D;
}
//------------------------------------------------------------------------------
// Quantization
//
// Generates a pshufb constant for shuffling 16b words.
#define PSHUFB_CST(A,B,C,D,E,F,G,H) \
_mm_set_epi8(2 * (H) + 1, 2 * (H) + 0, 2 * (G) + 1, 2 * (G) + 0, \
2 * (F) + 1, 2 * (F) + 0, 2 * (E) + 1, 2 * (E) + 0, \
2 * (D) + 1, 2 * (D) + 0, 2 * (C) + 1, 2 * (C) + 0, \
2 * (B) + 1, 2 * (B) + 0, 2 * (A) + 1, 2 * (A) + 0)
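// For example, PSHUFB_CST(0, 1, 4, -1, 5, 2, 3, 6) places 16-bit word 0 in
// slot 0, word 1 in slot 1, word 4 in slot 2, and so on; a -1 argument yields
// negative byte indices (high bit set), which _mm_shuffle_epi8 zeroes out.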
static WEBP_INLINE int DoQuantizeBlock_SSE41(int16_t in[16], int16_t out[16],
const uint16_t* const sharpen,
const VP8Matrix* const mtx) {
const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
const __m128i zero = _mm_setzero_si128();
__m128i out0, out8;
__m128i packed_out;
// Load all inputs.
__m128i in0 = _mm_loadu_si128((__m128i*)&in[0]);
__m128i in8 = _mm_loadu_si128((__m128i*)&in[8]);
const __m128i iq0 = _mm_loadu_si128((const __m128i*)&mtx->iq_[0]);
const __m128i iq8 = _mm_loadu_si128((const __m128i*)&mtx->iq_[8]);
const __m128i q0 = _mm_loadu_si128((const __m128i*)&mtx->q_[0]);
const __m128i q8 = _mm_loadu_si128((const __m128i*)&mtx->q_[8]);
// coeff = abs(in)
__m128i coeff0 = _mm_abs_epi16(in0);
__m128i coeff8 = _mm_abs_epi16(in8);
// coeff = abs(in) + sharpen
if (sharpen != NULL) {
const __m128i sharpen0 = _mm_loadu_si128((const __m128i*)&sharpen[0]);
const __m128i sharpen8 = _mm_loadu_si128((const __m128i*)&sharpen[8]);
coeff0 = _mm_add_epi16(coeff0, sharpen0);
coeff8 = _mm_add_epi16(coeff8, sharpen8);
}
// out = (coeff * iQ + B) >> QFIX
{
// doing calculations with 32b precision (QFIX=17)
// out = (coeff * iQ)
const __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
const __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
const __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
const __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
__m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H);
__m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H);
__m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H);
__m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H);
// out = (coeff * iQ + B)
const __m128i bias_00 = _mm_loadu_si128((const __m128i*)&mtx->bias_[0]);
const __m128i bias_04 = _mm_loadu_si128((const __m128i*)&mtx->bias_[4]);
const __m128i bias_08 = _mm_loadu_si128((const __m128i*)&mtx->bias_[8]);
const __m128i bias_12 = _mm_loadu_si128((const __m128i*)&mtx->bias_[12]);
out_00 = _mm_add_epi32(out_00, bias_00);
out_04 = _mm_add_epi32(out_04, bias_04);
out_08 = _mm_add_epi32(out_08, bias_08);
out_12 = _mm_add_epi32(out_12, bias_12);
// out = QUANTDIV(coeff, iQ, B, QFIX)
out_00 = _mm_srai_epi32(out_00, QFIX);
out_04 = _mm_srai_epi32(out_04, QFIX);
out_08 = _mm_srai_epi32(out_08, QFIX);
out_12 = _mm_srai_epi32(out_12, QFIX);
// pack result as 16b
out0 = _mm_packs_epi32(out_00, out_04);
out8 = _mm_packs_epi32(out_08, out_12);
// if (coeff > 2047) coeff = 2047
out0 = _mm_min_epi16(out0, max_coeff_2047);
out8 = _mm_min_epi16(out8, max_coeff_2047);
}
// put sign back
out0 = _mm_sign_epi16(out0, in0);
out8 = _mm_sign_epi16(out8, in8);
// in = out * Q
in0 = _mm_mullo_epi16(out0, q0);
in8 = _mm_mullo_epi16(out8, q8);
_mm_storeu_si128((__m128i*)&in[0], in0);
_mm_storeu_si128((__m128i*)&in[8], in8);
// zigzag the output before storing it. The re-ordering is:
// 0 1 2 3 4 5 6 7 | 8 9 10 11 12 13 14 15
// -> 0 1 4[8]5 2 3 6 | 9 12 13 10 [7]11 14 15
// There are only two misplaced entries ([8] and [7]) that cross the
// register boundaries.
// We use pshufb instead of pshuflo/pshufhi.
{
const __m128i kCst_lo = PSHUFB_CST(0, 1, 4, -1, 5, 2, 3, 6);
const __m128i kCst_7 = PSHUFB_CST(-1, -1, -1, -1, 7, -1, -1, -1);
const __m128i tmp_lo = _mm_shuffle_epi8(out0, kCst_lo);
const __m128i tmp_7 = _mm_shuffle_epi8(out0, kCst_7); // extract #7
const __m128i kCst_hi = PSHUFB_CST(1, 4, 5, 2, -1, 3, 6, 7);
const __m128i kCst_8 = PSHUFB_CST(-1, -1, -1, 0, -1, -1, -1, -1);
const __m128i tmp_hi = _mm_shuffle_epi8(out8, kCst_hi);
const __m128i tmp_8 = _mm_shuffle_epi8(out8, kCst_8); // extract #8
const __m128i out_z0 = _mm_or_si128(tmp_lo, tmp_8);
const __m128i out_z8 = _mm_or_si128(tmp_hi, tmp_7);
_mm_storeu_si128((__m128i*)&out[0], out_z0);
_mm_storeu_si128((__m128i*)&out[8], out_z8);
packed_out = _mm_packs_epi16(out_z0, out_z8);
}
// detect if all 'out' values are zeroes or not
return (_mm_movemask_epi8(_mm_cmpeq_epi8(packed_out, zero)) != 0xffff);
}
#undef PSHUFB_CST
static int QuantizeBlock_SSE41(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
return DoQuantizeBlock_SSE41(in, out, &mtx->sharpen_[0], mtx);
}
static int QuantizeBlockWHT_SSE41(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
return DoQuantizeBlock_SSE41(in, out, NULL, mtx);
}
static int Quantize2Blocks_SSE41(int16_t in[32], int16_t out[32],
const VP8Matrix* const mtx) {
int nz;
const uint16_t* const sharpen = &mtx->sharpen_[0];
nz = DoQuantizeBlock_SSE41(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
nz |= DoQuantizeBlock_SSE41(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1;
return nz;
}
//------------------------------------------------------------------------------
// Entry point
extern void VP8EncDspInitSSE41(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE41(void) {
VP8CollectHistogram = CollectHistogram_SSE41;
VP8EncQuantizeBlock = QuantizeBlock_SSE41;
VP8EncQuantize2Blocks = Quantize2Blocks_SSE41;
VP8EncQuantizeBlockWHT = QuantizeBlockWHT_SSE41;
VP8TDisto4x4 = Disto4x4_SSE41;
VP8TDisto16x16 = Disto16x16_SSE41;
}
#else // !WEBP_USE_SSE41
WEBP_DSP_INIT_STUB(VP8EncDspInitSSE41)
#endif // WEBP_USE_SSE41

View File

@ -0,0 +1,948 @@
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Image transform methods for lossless encoder.
//
// Authors: Vikas Arora (vikaas.arora@gmail.com)
// Jyrki Alakuijala (jyrki@google.com)
// Urvang Joshi (urvang@google.com)
#include "src/dsp/dsp.h"
#include <assert.h>
#include <math.h>
#include <stdlib.h>
#include "src/dec/vp8li_dec.h"
#include "src/utils/endian_inl_utils.h"
#include "src/dsp/lossless.h"
#include "src/dsp/lossless_common.h"
#include "src/dsp/yuv.h"
// lookup table for small values of log2(int)
const float kLog2Table[LOG_LOOKUP_IDX_MAX] = {
0.0000000000000000f, 0.0000000000000000f,
1.0000000000000000f, 1.5849625007211560f,
2.0000000000000000f, 2.3219280948873621f,
2.5849625007211560f, 2.8073549220576041f,
3.0000000000000000f, 3.1699250014423121f,
3.3219280948873621f, 3.4594316186372973f,
3.5849625007211560f, 3.7004397181410921f,
3.8073549220576041f, 3.9068905956085187f,
4.0000000000000000f, 4.0874628412503390f,
4.1699250014423121f, 4.2479275134435852f,
4.3219280948873626f, 4.3923174227787606f,
4.4594316186372973f, 4.5235619560570130f,
4.5849625007211560f, 4.6438561897747243f,
4.7004397181410917f, 4.7548875021634682f,
4.8073549220576037f, 4.8579809951275718f,
4.9068905956085187f, 4.9541963103868749f,
5.0000000000000000f, 5.0443941193584533f,
5.0874628412503390f, 5.1292830169449663f,
5.1699250014423121f, 5.2094533656289501f,
5.2479275134435852f, 5.2854022188622487f,
5.3219280948873626f, 5.3575520046180837f,
5.3923174227787606f, 5.4262647547020979f,
5.4594316186372973f, 5.4918530963296747f,
5.5235619560570130f, 5.5545888516776376f,
5.5849625007211560f, 5.6147098441152083f,
5.6438561897747243f, 5.6724253419714951f,
5.7004397181410917f, 5.7279204545631987f,
5.7548875021634682f, 5.7813597135246599f,
5.8073549220576037f, 5.8328900141647412f,
5.8579809951275718f, 5.8826430493618415f,
5.9068905956085187f, 5.9307373375628866f,
5.9541963103868749f, 5.9772799234999167f,
6.0000000000000000f, 6.0223678130284543f,
6.0443941193584533f, 6.0660891904577720f,
6.0874628412503390f, 6.1085244567781691f,
6.1292830169449663f, 6.1497471195046822f,
6.1699250014423121f, 6.1898245588800175f,
6.2094533656289501f, 6.2288186904958804f,
6.2479275134435852f, 6.2667865406949010f,
6.2854022188622487f, 6.3037807481771030f,
6.3219280948873626f, 6.3398500028846243f,
6.3575520046180837f, 6.3750394313469245f,
6.3923174227787606f, 6.4093909361377017f,
6.4262647547020979f, 6.4429434958487279f,
6.4594316186372973f, 6.4757334309663976f,
6.4918530963296747f, 6.5077946401986963f,
6.5235619560570130f, 6.5391588111080309f,
6.5545888516776376f, 6.5698556083309478f,
6.5849625007211560f, 6.5999128421871278f,
6.6147098441152083f, 6.6293566200796094f,
6.6438561897747243f, 6.6582114827517946f,
6.6724253419714951f, 6.6865005271832185f,
6.7004397181410917f, 6.7142455176661224f,
6.7279204545631987f, 6.7414669864011464f,
6.7548875021634682f, 6.7681843247769259f,
6.7813597135246599f, 6.7944158663501061f,
6.8073549220576037f, 6.8201789624151878f,
6.8328900141647412f, 6.8454900509443747f,
6.8579809951275718f, 6.8703647195834047f,
6.8826430493618415f, 6.8948177633079437f,
6.9068905956085187f, 6.9188632372745946f,
6.9307373375628866f, 6.9425145053392398f,
6.9541963103868749f, 6.9657842846620869f,
6.9772799234999167f, 6.9886846867721654f,
7.0000000000000000f, 7.0112272554232539f,
7.0223678130284543f, 7.0334230015374501f,
7.0443941193584533f, 7.0552824355011898f,
7.0660891904577720f, 7.0768155970508308f,
7.0874628412503390f, 7.0980320829605263f,
7.1085244567781691f, 7.1189410727235076f,
7.1292830169449663f, 7.1395513523987936f,
7.1497471195046822f, 7.1598713367783890f,
7.1699250014423121f, 7.1799090900149344f,
7.1898245588800175f, 7.1996723448363644f,
7.2094533656289501f, 7.2191685204621611f,
7.2288186904958804f, 7.2384047393250785f,
7.2479275134435852f, 7.2573878426926521f,
7.2667865406949010f, 7.2761244052742375f,
7.2854022188622487f, 7.2946207488916270f,
7.3037807481771030f, 7.3128829552843557f,
7.3219280948873626f, 7.3309168781146167f,
7.3398500028846243f, 7.3487281542310771f,
7.3575520046180837f, 7.3663222142458160f,
7.3750394313469245f, 7.3837042924740519f,
7.3923174227787606f, 7.4008794362821843f,
7.4093909361377017f, 7.4178525148858982f,
7.4262647547020979f, 7.4346282276367245f,
7.4429434958487279f, 7.4512111118323289f,
7.4594316186372973f, 7.4676055500829976f,
7.4757334309663976f, 7.4838157772642563f,
7.4918530963296747f, 7.4998458870832056f,
7.5077946401986963f, 7.5156998382840427f,
7.5235619560570130f, 7.5313814605163118f,
7.5391588111080309f, 7.5468944598876364f,
7.5545888516776376f, 7.5622424242210728f,
7.5698556083309478f, 7.5774288280357486f,
7.5849625007211560f, 7.5924570372680806f,
7.5999128421871278f, 7.6073303137496104f,
7.6147098441152083f, 7.6220518194563764f,
7.6293566200796094f, 7.6366246205436487f,
7.6438561897747243f, 7.6510516911789281f,
7.6582114827517946f, 7.6653359171851764f,
7.6724253419714951f, 7.6794800995054464f,
7.6865005271832185f, 7.6934869574993252f,
7.7004397181410917f, 7.7073591320808825f,
7.7142455176661224f, 7.7210991887071855f,
7.7279204545631987f, 7.7347096202258383f,
7.7414669864011464f, 7.7481928495894605f,
7.7548875021634682f, 7.7615512324444795f,
7.7681843247769259f, 7.7747870596011736f,
7.7813597135246599f, 7.7879025593914317f,
7.7944158663501061f, 7.8008998999203047f,
7.8073549220576037f, 7.8137811912170374f,
7.8201789624151878f, 7.8265484872909150f,
7.8328900141647412f, 7.8392037880969436f,
7.8454900509443747f, 7.8517490414160571f,
7.8579809951275718f, 7.8641861446542797f,
7.8703647195834047f, 7.8765169465649993f,
7.8826430493618415f, 7.8887432488982591f,
7.8948177633079437f, 7.9008668079807486f,
7.9068905956085187f, 7.9128893362299619f,
7.9188632372745946f, 7.9248125036057812f,
7.9307373375628866f, 7.9366379390025709f,
7.9425145053392398f, 7.9483672315846778f,
7.9541963103868749f, 7.9600019320680805f,
7.9657842846620869f, 7.9715435539507719f,
7.9772799234999167f, 7.9829935746943103f,
7.9886846867721654f, 7.9943534368588577f
};
const float kSLog2Table[LOG_LOOKUP_IDX_MAX] = {
0.00000000f, 0.00000000f, 2.00000000f, 4.75488750f,
8.00000000f, 11.60964047f, 15.50977500f, 19.65148445f,
24.00000000f, 28.52932501f, 33.21928095f, 38.05374781f,
43.01955001f, 48.10571634f, 53.30296891f, 58.60335893f,
64.00000000f, 69.48686830f, 75.05865003f, 80.71062276f,
86.43856190f, 92.23866588f, 98.10749561f, 104.04192499f,
110.03910002f, 116.09640474f, 122.21143267f, 128.38196256f,
134.60593782f, 140.88144886f, 147.20671787f, 153.58008562f,
160.00000000f, 166.46500594f, 172.97373660f, 179.52490559f,
186.11730005f, 192.74977453f, 199.42124551f, 206.13068654f,
212.87712380f, 219.65963219f, 226.47733176f, 233.32938445f,
240.21499122f, 247.13338933f, 254.08384998f, 261.06567603f,
268.07820003f, 275.12078236f, 282.19280949f, 289.29369244f,
296.42286534f, 303.57978409f, 310.76392512f, 317.97478424f,
325.21187564f, 332.47473081f, 339.76289772f, 347.07593991f,
354.41343574f, 361.77497759f, 369.16017124f, 376.56863518f,
384.00000000f, 391.45390785f, 398.93001188f, 406.42797576f,
413.94747321f, 421.48818752f, 429.04981119f, 436.63204548f,
444.23460010f, 451.85719280f, 459.49954906f, 467.16140179f,
474.84249102f, 482.54256363f, 490.26137307f, 497.99867911f,
505.75424759f, 513.52785023f, 521.31926438f, 529.12827280f,
536.95466351f, 544.79822957f, 552.65876890f, 560.53608414f,
568.42998244f, 576.34027536f, 584.26677867f, 592.20931226f,
600.16769996f, 608.14176943f, 616.13135206f, 624.13628279f,
632.15640007f, 640.19154569f, 648.24156472f, 656.30630539f,
664.38561898f, 672.47935976f, 680.58738488f, 688.70955430f,
696.84573069f, 704.99577935f, 713.15956818f, 721.33696754f,
729.52785023f, 737.73209140f, 745.94956849f, 754.18016116f,
762.42375127f, 770.68022275f, 778.94946161f, 787.23135586f,
795.52579543f, 803.83267219f, 812.15187982f, 820.48331383f,
828.82687147f, 837.18245171f, 845.54995518f, 853.92928416f,
862.32034249f, 870.72303558f, 879.13727036f, 887.56295522f,
896.00000000f, 904.44831595f, 912.90781569f, 921.37841320f,
929.86002376f, 938.35256392f, 946.85595152f, 955.37010560f,
963.89494641f, 972.43039537f, 980.97637504f, 989.53280911f,
998.09962237f, 1006.67674069f, 1015.26409097f, 1023.86160116f,
1032.46920021f, 1041.08681805f, 1049.71438560f, 1058.35183469f,
1066.99909811f, 1075.65610955f, 1084.32280357f, 1092.99911564f,
1101.68498204f, 1110.38033993f, 1119.08512727f, 1127.79928282f,
1136.52274614f, 1145.25545758f, 1153.99735821f, 1162.74838989f,
1171.50849518f, 1180.27761738f, 1189.05570047f, 1197.84268914f,
1206.63852876f, 1215.44316535f, 1224.25654560f, 1233.07861684f,
1241.90932703f, 1250.74862473f, 1259.59645914f, 1268.45278005f,
1277.31753781f, 1286.19068338f, 1295.07216828f, 1303.96194457f,
1312.85996488f, 1321.76618236f, 1330.68055071f, 1339.60302413f,
1348.53355734f, 1357.47210556f, 1366.41862452f, 1375.37307041f,
1384.33539991f, 1393.30557020f, 1402.28353887f, 1411.26926400f,
1420.26270412f, 1429.26381818f, 1438.27256558f, 1447.28890615f,
1456.31280014f, 1465.34420819f, 1474.38309138f, 1483.42941118f,
1492.48312945f, 1501.54420843f, 1510.61261078f, 1519.68829949f,
1528.77123795f, 1537.86138993f, 1546.95871952f, 1556.06319119f,
1565.17476976f, 1574.29342040f, 1583.41910860f, 1592.55180020f,
1601.69146137f, 1610.83805860f, 1619.99155871f, 1629.15192882f,
1638.31913637f, 1647.49314911f, 1656.67393509f, 1665.86146266f,
1675.05570047f, 1684.25661744f, 1693.46418280f, 1702.67836605f,
1711.89913698f, 1721.12646563f, 1730.36032233f, 1739.60067768f,
1748.84750254f, 1758.10076802f, 1767.36044551f, 1776.62650662f,
1785.89892323f, 1795.17766747f, 1804.46271172f, 1813.75402857f,
1823.05159087f, 1832.35537170f, 1841.66534438f, 1850.98148244f,
1860.30375965f, 1869.63214999f, 1878.96662767f, 1888.30716711f,
1897.65374295f, 1907.00633003f, 1916.36490342f, 1925.72943838f,
1935.09991037f, 1944.47629506f, 1953.85856831f, 1963.24670620f,
1972.64068498f, 1982.04048108f, 1991.44607117f, 2000.85743204f,
2010.27454072f, 2019.69737440f, 2029.12591044f, 2038.56012640f
};
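// Note: each kSLog2Table[v] entry holds v * log2(v) (e.g. entry 3 is
// 3 * log2(3) ~= 4.7549), just as kLog2Table[v] holds log2(v). Both act as
// fast-path lookups for arguments below LOG_LOOKUP_IDX_MAX.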
const VP8LPrefixCode kPrefixEncodeCode[PREFIX_LOOKUP_IDX_MAX] = {
{ 0, 0}, { 0, 0}, { 1, 0}, { 2, 0}, { 3, 0}, { 4, 1}, { 4, 1}, { 5, 1},
{ 5, 1}, { 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, { 7, 2}, { 7, 2}, { 7, 2},
{ 7, 2}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3},
{ 8, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3},
{ 9, 3}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4},
{10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4},
{10, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4},
{11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4},
{11, 4}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
{12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
{12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
{12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
{12, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
{13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
{13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
{13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
{13, 5}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
{14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
{14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
{14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
{14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
{14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
{14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
{14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
{14, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
{15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
{15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
{15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
{15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
{15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
{15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
{15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
{15, 6}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
{16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
{16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
{16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
{16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
{16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
{16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
{16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
{16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
{16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
{16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
{16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
{16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
{16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
{16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
{16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
{16, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
{17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
{17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
{17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
{17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
{17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
{17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
{17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
{17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
{17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
{17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
{17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
{17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
{17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
{17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
{17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
};
const uint8_t kPrefixEncodeExtraBitsValue[PREFIX_LOOKUP_IDX_MAX] = {
0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3,
0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126,
127,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126
};
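// For values beyond the lookup range, the same (code, extra_bits) pair can be
// derived arithmetically; a sketch of the mapping the two tables above encode,
// valid for v >= 3 (illustrative; upstream also ships equivalent non-LUT
// helpers):
//   highest_bit = BitsLog2Floor(v - 1);
//   second_bit  = ((v - 1) >> (highest_bit - 1)) & 1;
//   code        = 2 * highest_bit + second_bit;
//   extra_bits  = highest_bit - 1;
//   extra_bits_value = (v - 1) & ((1 << extra_bits) - 1);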
static float FastSLog2Slow_C(uint32_t v) {
assert(v >= LOG_LOOKUP_IDX_MAX);
if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
#if !defined(WEBP_HAVE_SLOW_CLZ_CTZ)
// use clz if available
const int log_cnt = BitsLog2Floor(v) - 7;
const uint32_t y = 1 << log_cnt;
int correction = 0;
const float v_f = (float)v;
const uint32_t orig_v = v;
v >>= log_cnt;
#else
int log_cnt = 0;
uint32_t y = 1;
int correction = 0;
const float v_f = (float)v;
const uint32_t orig_v = v;
do {
++log_cnt;
v = v >> 1;
y = y << 1;
} while (v >= LOG_LOOKUP_IDX_MAX);
#endif
// vf = (2^log_cnt) * Xf; where y = 2^log_cnt and Xf < 256
// Xf = floor(Xf) * (1 + (v % y) / v)
// log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v)
    // The correction factor: log(1 + d) ~ d for very small d values, so
// log2(1 + (v % y) / v) ~ LOG_2_RECIPROCAL * (v % y)/v
// LOG_2_RECIPROCAL ~ 23/16
correction = (23 * (orig_v & (y - 1))) >> 4;
return v_f * (kLog2Table[v] + log_cnt) + correction;
} else {
return (float)(LOG_2_RECIPROCAL * v * log((double)v));
}
}
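// Worked example: for v = 1000, BitsLog2Floor(1000) = 9, so log_cnt = 2,
// y = 4 and v >> 2 = 250. The remainder (1000 & 3) is 0, so the correction
// term vanishes and the result is 1000 * (kLog2Table[250] + 2), i.e. about
// 1000 * 9.966, matching 1000 * log2(1000). FastLog2Slow_C below reuses the
// same decomposition without the leading factor of v.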
static float FastLog2Slow_C(uint32_t v) {
assert(v >= LOG_LOOKUP_IDX_MAX);
if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
#if !defined(WEBP_HAVE_SLOW_CLZ_CTZ)
// use clz if available
const int log_cnt = BitsLog2Floor(v) - 7;
const uint32_t y = 1 << log_cnt;
const uint32_t orig_v = v;
double log_2;
v >>= log_cnt;
#else
int log_cnt = 0;
uint32_t y = 1;
const uint32_t orig_v = v;
double log_2;
do {
++log_cnt;
v = v >> 1;
y = y << 1;
} while (v >= LOG_LOOKUP_IDX_MAX);
#endif
log_2 = kLog2Table[v] + log_cnt;
if (orig_v >= APPROX_LOG_MAX) {
// Since the division is still expensive, add this correction factor only
// for large values of 'v'.
const int correction = (23 * (orig_v & (y - 1))) >> 4;
log_2 += (double)correction / orig_v;
}
return (float)log_2;
} else {
return (float)(LOG_2_RECIPROCAL * log((double)v));
}
}
//------------------------------------------------------------------------------
// Methods to calculate Entropy (Shannon).
// Compute the combined Shannon entropy for distributions {X} and {X+Y}.
static float CombinedShannonEntropy_C(const int X[256], const int Y[256]) {
int i;
double retval = 0.;
int sumX = 0, sumXY = 0;
for (i = 0; i < 256; ++i) {
const int x = X[i];
if (x != 0) {
const int xy = x + Y[i];
sumX += x;
retval -= VP8LFastSLog2(x);
sumXY += xy;
retval -= VP8LFastSLog2(xy);
} else if (Y[i] != 0) {
sumXY += Y[i];
retval -= VP8LFastSLog2(Y[i]);
}
}
retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY);
return (float)retval;
}
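// Using VP8LFastSLog2(x) ~= x * log2(x), the return value approximates the
// combined coding cost in bits:
//   sumX * log2(sumX) - sum_i X[i] * log2(X[i])
//     + sumXY * log2(sumXY) - sum_i (X[i] + Y[i]) * log2(X[i] + Y[i])
// i.e. the Shannon entropy of {X} plus that of {X+Y}, each scaled by its
// total count.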
void VP8LBitEntropyInit(VP8LBitEntropy* const entropy) {
entropy->entropy = 0.;
entropy->sum = 0;
entropy->nonzeros = 0;
entropy->max_val = 0;
entropy->nonzero_code = VP8L_NON_TRIVIAL_SYM;
}
void VP8LBitsEntropyUnrefined(const uint32_t* const array, int n,
VP8LBitEntropy* const entropy) {
int i;
VP8LBitEntropyInit(entropy);
for (i = 0; i < n; ++i) {
if (array[i] != 0) {
entropy->sum += array[i];
entropy->nonzero_code = i;
++entropy->nonzeros;
entropy->entropy -= VP8LFastSLog2(array[i]);
if (entropy->max_val < array[i]) {
entropy->max_val = array[i];
}
}
}
entropy->entropy += VP8LFastSLog2(entropy->sum);
}
static WEBP_INLINE void GetEntropyUnrefinedHelper(
uint32_t val, int i, uint32_t* const val_prev, int* const i_prev,
VP8LBitEntropy* const bit_entropy, VP8LStreaks* const stats) {
const int streak = i - *i_prev;
// Gather info for the bit entropy.
if (*val_prev != 0) {
bit_entropy->sum += (*val_prev) * streak;
bit_entropy->nonzeros += streak;
bit_entropy->nonzero_code = *i_prev;
bit_entropy->entropy -= VP8LFastSLog2(*val_prev) * streak;
if (bit_entropy->max_val < *val_prev) {
bit_entropy->max_val = *val_prev;
}
}
// Gather info for the Huffman cost.
stats->counts[*val_prev != 0] += (streak > 3);
stats->streaks[*val_prev != 0][(streak > 3)] += streak;
*val_prev = val;
*i_prev = i;
}
static void GetEntropyUnrefined_C(const uint32_t X[], int length,
VP8LBitEntropy* const bit_entropy,
VP8LStreaks* const stats) {
int i;
int i_prev = 0;
uint32_t x_prev = X[0];
memset(stats, 0, sizeof(*stats));
VP8LBitEntropyInit(bit_entropy);
for (i = 1; i < length; ++i) {
const uint32_t x = X[i];
if (x != x_prev) {
GetEntropyUnrefinedHelper(x, i, &x_prev, &i_prev, bit_entropy, stats);
}
}
GetEntropyUnrefinedHelper(0, i, &x_prev, &i_prev, bit_entropy, stats);
bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
}
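// Example trace: for X = {5, 5, 5, 5, 0, 0, 7} the helper fires at i = 4
// (a streak of four 5s: sum += 20, counts[1] += 1, streaks[1][1] += 4),
// at i = 6 (a streak of two 0s: streaks[0][0] += 2), and at the final
// flush (a single 7: sum += 7, streaks[1][0] += 1).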
static void GetCombinedEntropyUnrefined_C(const uint32_t X[],
const uint32_t Y[],
int length,
VP8LBitEntropy* const bit_entropy,
VP8LStreaks* const stats) {
int i = 1;
int i_prev = 0;
uint32_t xy_prev = X[0] + Y[0];
memset(stats, 0, sizeof(*stats));
VP8LBitEntropyInit(bit_entropy);
for (i = 1; i < length; ++i) {
const uint32_t xy = X[i] + Y[i];
if (xy != xy_prev) {
GetEntropyUnrefinedHelper(xy, i, &xy_prev, &i_prev, bit_entropy, stats);
}
}
GetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, bit_entropy, stats);
bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
}
//------------------------------------------------------------------------------
void VP8LSubtractGreenFromBlueAndRed_C(uint32_t* argb_data, int num_pixels) {
int i;
for (i = 0; i < num_pixels; ++i) {
const int argb = argb_data[i];
const int green = (argb >> 8) & 0xff;
const uint32_t new_r = (((argb >> 16) & 0xff) - green) & 0xff;
const uint32_t new_b = (((argb >> 0) & 0xff) - green) & 0xff;
argb_data[i] = (argb & 0xff00ff00u) | (new_r << 16) | new_b;
}
}
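// Example: argb = 0xff405060 has green = 0x50, so red becomes
// (0x40 - 0x50) & 0xff = 0xf0 and blue becomes (0x60 - 0x50) & 0xff = 0x10,
// giving 0xfff05010. Alpha and green are left untouched.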
static WEBP_INLINE int ColorTransformDelta(int8_t color_pred, int8_t color) {
return ((int)color_pred * color) >> 5;
}
static WEBP_INLINE int8_t U32ToS8(uint32_t v) {
return (int8_t)(v & 0xff);
}
void VP8LTransformColor_C(const VP8LMultipliers* const m, uint32_t* data,
int num_pixels) {
int i;
for (i = 0; i < num_pixels; ++i) {
const uint32_t argb = data[i];
const int8_t green = U32ToS8(argb >> 8);
const int8_t red = U32ToS8(argb >> 16);
int new_red = red & 0xff;
int new_blue = argb & 0xff;
new_red -= ColorTransformDelta(m->green_to_red_, green);
new_red &= 0xff;
new_blue -= ColorTransformDelta(m->green_to_blue_, green);
new_blue -= ColorTransformDelta(m->red_to_blue_, red);
new_blue &= 0xff;
data[i] = (argb & 0xff00ff00u) | (new_red << 16) | (new_blue);
}
}
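// ColorTransformDelta() is a signed 3.5 fixed-point multiply, e.g. a
// multiplier of 16 (0.5 in fixed point) applied to a color value of -32
// yields (16 * -32) >> 5 = -16, i.e. half the input.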
static WEBP_INLINE uint8_t TransformColorRed(uint8_t green_to_red,
uint32_t argb) {
const int8_t green = U32ToS8(argb >> 8);
int new_red = argb >> 16;
new_red -= ColorTransformDelta(green_to_red, green);
return (new_red & 0xff);
}
static WEBP_INLINE uint8_t TransformColorBlue(uint8_t green_to_blue,
uint8_t red_to_blue,
uint32_t argb) {
const int8_t green = U32ToS8(argb >> 8);
const int8_t red = U32ToS8(argb >> 16);
uint8_t new_blue = argb & 0xff;
new_blue -= ColorTransformDelta(green_to_blue, green);
new_blue -= ColorTransformDelta(red_to_blue, red);
return (new_blue & 0xff);
}
void VP8LCollectColorRedTransforms_C(const uint32_t* argb, int stride,
int tile_width, int tile_height,
int green_to_red, int histo[]) {
while (tile_height-- > 0) {
int x;
for (x = 0; x < tile_width; ++x) {
++histo[TransformColorRed((uint8_t)green_to_red, argb[x])];
}
argb += stride;
}
}
void VP8LCollectColorBlueTransforms_C(const uint32_t* argb, int stride,
int tile_width, int tile_height,
int green_to_blue, int red_to_blue,
int histo[]) {
while (tile_height-- > 0) {
int x;
for (x = 0; x < tile_width; ++x) {
++histo[TransformColorBlue((uint8_t)green_to_blue, (uint8_t)red_to_blue,
argb[x])];
}
argb += stride;
}
}
//------------------------------------------------------------------------------
static int VectorMismatch_C(const uint32_t* const array1,
const uint32_t* const array2, int length) {
int match_len = 0;
while (match_len < length && array1[match_len] == array2[match_len]) {
++match_len;
}
return match_len;
}
// Bundles multiple (1, 2, 4 or 8) pixels into a single pixel.
void VP8LBundleColorMap_C(const uint8_t* const row, int width, int xbits,
uint32_t* dst) {
int x;
if (xbits > 0) {
const int bit_depth = 1 << (3 - xbits);
const int mask = (1 << xbits) - 1;
uint32_t code = 0xff000000;
for (x = 0; x < width; ++x) {
const int xsub = x & mask;
if (xsub == 0) {
code = 0xff000000;
}
code |= row[x] << (8 + bit_depth * xsub);
dst[x >> xbits] = code;
}
} else {
for (x = 0; x < width; ++x) dst[x] = 0xff000000 | (row[x] << 8);
}
}
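// Example with xbits = 1 (bit_depth = 4, two palette indices per output
// pixel): row = {3, 5} packs as 0xff000000 | (3 << 8) | (5 << 12)
// = 0xff005300, stored in dst[0].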
//------------------------------------------------------------------------------
static double ExtraCost_C(const uint32_t* population, int length) {
int i;
double cost = 0.;
for (i = 2; i < length - 2; ++i) cost += (i >> 1) * population[i + 2];
return cost;
}
static double ExtraCostCombined_C(const uint32_t* X, const uint32_t* Y,
int length) {
int i;
double cost = 0.;
for (i = 2; i < length - 2; ++i) {
const int xy = X[i + 2] + Y[i + 2];
cost += (i >> 1) * xy;
}
return cost;
}
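// Both cost functions count the extra bits attached to VP8L prefix codes:
// symbol (i + 2) carries (i >> 1) extra bits, so e.g. a population count of
// 10 at symbol 4 (i = 2) contributes 10 * 1 = 10 bits.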
//------------------------------------------------------------------------------
static void AddVector_C(const uint32_t* a, const uint32_t* b, uint32_t* out,
int size) {
int i;
for (i = 0; i < size; ++i) out[i] = a[i] + b[i];
}
static void AddVectorEq_C(const uint32_t* a, uint32_t* out, int size) {
int i;
for (i = 0; i < size; ++i) out[i] += a[i];
}
#define ADD(X, ARG, LEN) do { \
if (a->is_used_[X]) { \
if (b->is_used_[X]) { \
VP8LAddVector(a->ARG, b->ARG, out->ARG, (LEN)); \
} else { \
memcpy(&out->ARG[0], &a->ARG[0], (LEN) * sizeof(out->ARG[0])); \
} \
} else if (b->is_used_[X]) { \
memcpy(&out->ARG[0], &b->ARG[0], (LEN) * sizeof(out->ARG[0])); \
} else { \
memset(&out->ARG[0], 0, (LEN) * sizeof(out->ARG[0])); \
} \
} while (0)
#define ADD_EQ(X, ARG, LEN) do { \
if (a->is_used_[X]) { \
if (out->is_used_[X]) { \
VP8LAddVectorEq(a->ARG, out->ARG, (LEN)); \
} else { \
memcpy(&out->ARG[0], &a->ARG[0], (LEN) * sizeof(out->ARG[0])); \
} \
} \
} while (0)
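// ADD() merges channel X of histograms 'a' and 'b' into 'out', while
// ADD_EQ() accumulates 'a' into 'out' in place; honoring the is_used_ flags
// lets untouched arrays be copied or zeroed instead of summed element-wise.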
void VP8LHistogramAdd(const VP8LHistogram* const a,
const VP8LHistogram* const b, VP8LHistogram* const out) {
int i;
const int literal_size = VP8LHistogramNumCodes(a->palette_code_bits_);
assert(a->palette_code_bits_ == b->palette_code_bits_);
if (b != out) {
ADD(0, literal_, literal_size);
ADD(1, red_, NUM_LITERAL_CODES);
ADD(2, blue_, NUM_LITERAL_CODES);
ADD(3, alpha_, NUM_LITERAL_CODES);
ADD(4, distance_, NUM_DISTANCE_CODES);
for (i = 0; i < 5; ++i) {
out->is_used_[i] = (a->is_used_[i] | b->is_used_[i]);
}
} else {
ADD_EQ(0, literal_, literal_size);
ADD_EQ(1, red_, NUM_LITERAL_CODES);
ADD_EQ(2, blue_, NUM_LITERAL_CODES);
ADD_EQ(3, alpha_, NUM_LITERAL_CODES);
ADD_EQ(4, distance_, NUM_DISTANCE_CODES);
for (i = 0; i < 5; ++i) out->is_used_[i] |= a->is_used_[i];
}
}
#undef ADD
#undef ADD_EQ
//------------------------------------------------------------------------------
// Image transforms.
static void PredictorSub0_C(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int i;
for (i = 0; i < num_pixels; ++i) out[i] = VP8LSubPixels(in[i], ARGB_BLACK);
(void)upper;
}
static void PredictorSub1_C(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int i;
for (i = 0; i < num_pixels; ++i) out[i] = VP8LSubPixels(in[i], in[i - 1]);
(void)upper;
}
// It subtracts the prediction from the input pixel and stores the residual
// in the output pixel.
#define GENERATE_PREDICTOR_SUB(PREDICTOR_I) \
static void PredictorSub##PREDICTOR_I##_C(const uint32_t* in, \
const uint32_t* upper, \
int num_pixels, uint32_t* out) { \
int x; \
assert(upper != NULL); \
for (x = 0; x < num_pixels; ++x) { \
const uint32_t pred = \
VP8LPredictor##PREDICTOR_I##_C(in[x - 1], upper + x); \
out[x] = VP8LSubPixels(in[x], pred); \
} \
}
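// For instance, the expansion for PREDICTOR_I == 2 takes the residual
// against the pixel directly above (VP8LPredictor2_C returns upper[x]);
// the higher modes blend the left, top and top-left neighbors.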
GENERATE_PREDICTOR_SUB(2)
GENERATE_PREDICTOR_SUB(3)
GENERATE_PREDICTOR_SUB(4)
GENERATE_PREDICTOR_SUB(5)
GENERATE_PREDICTOR_SUB(6)
GENERATE_PREDICTOR_SUB(7)
GENERATE_PREDICTOR_SUB(8)
GENERATE_PREDICTOR_SUB(9)
GENERATE_PREDICTOR_SUB(10)
GENERATE_PREDICTOR_SUB(11)
GENERATE_PREDICTOR_SUB(12)
GENERATE_PREDICTOR_SUB(13)
//------------------------------------------------------------------------------
VP8LProcessEncBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
VP8LTransformColorFunc VP8LTransformColor;
VP8LCollectColorBlueTransformsFunc VP8LCollectColorBlueTransforms;
VP8LCollectColorRedTransformsFunc VP8LCollectColorRedTransforms;
VP8LFastLog2SlowFunc VP8LFastLog2Slow;
VP8LFastLog2SlowFunc VP8LFastSLog2Slow;
VP8LCostFunc VP8LExtraCost;
VP8LCostCombinedFunc VP8LExtraCostCombined;
VP8LCombinedShannonEntropyFunc VP8LCombinedShannonEntropy;
VP8LGetEntropyUnrefinedFunc VP8LGetEntropyUnrefined;
VP8LGetCombinedEntropyUnrefinedFunc VP8LGetCombinedEntropyUnrefined;
VP8LAddVectorFunc VP8LAddVector;
VP8LAddVectorEqFunc VP8LAddVectorEq;
VP8LVectorMismatchFunc VP8LVectorMismatch;
VP8LBundleColorMapFunc VP8LBundleColorMap;
VP8LPredictorAddSubFunc VP8LPredictorsSub[16];
VP8LPredictorAddSubFunc VP8LPredictorsSub_C[16];
extern void VP8LEncDspInitSSE2(void);
extern void VP8LEncDspInitSSE41(void);
extern void VP8LEncDspInitNEON(void);
extern void VP8LEncDspInitMIPS32(void);
extern void VP8LEncDspInitMIPSdspR2(void);
extern void VP8LEncDspInitMSA(void);
WEBP_DSP_INIT_FUNC(VP8LEncDspInit) {
VP8LDspInit();
#if !WEBP_NEON_OMIT_C_CODE
VP8LSubtractGreenFromBlueAndRed = VP8LSubtractGreenFromBlueAndRed_C;
VP8LTransformColor = VP8LTransformColor_C;
#endif
VP8LCollectColorBlueTransforms = VP8LCollectColorBlueTransforms_C;
VP8LCollectColorRedTransforms = VP8LCollectColorRedTransforms_C;
VP8LFastLog2Slow = FastLog2Slow_C;
VP8LFastSLog2Slow = FastSLog2Slow_C;
VP8LExtraCost = ExtraCost_C;
VP8LExtraCostCombined = ExtraCostCombined_C;
VP8LCombinedShannonEntropy = CombinedShannonEntropy_C;
VP8LGetEntropyUnrefined = GetEntropyUnrefined_C;
VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined_C;
VP8LAddVector = AddVector_C;
VP8LAddVectorEq = AddVectorEq_C;
VP8LVectorMismatch = VectorMismatch_C;
VP8LBundleColorMap = VP8LBundleColorMap_C;
VP8LPredictorsSub[0] = PredictorSub0_C;
VP8LPredictorsSub[1] = PredictorSub1_C;
VP8LPredictorsSub[2] = PredictorSub2_C;
VP8LPredictorsSub[3] = PredictorSub3_C;
VP8LPredictorsSub[4] = PredictorSub4_C;
VP8LPredictorsSub[5] = PredictorSub5_C;
VP8LPredictorsSub[6] = PredictorSub6_C;
VP8LPredictorsSub[7] = PredictorSub7_C;
VP8LPredictorsSub[8] = PredictorSub8_C;
VP8LPredictorsSub[9] = PredictorSub9_C;
VP8LPredictorsSub[10] = PredictorSub10_C;
VP8LPredictorsSub[11] = PredictorSub11_C;
VP8LPredictorsSub[12] = PredictorSub12_C;
VP8LPredictorsSub[13] = PredictorSub13_C;
VP8LPredictorsSub[14] = PredictorSub0_C; // <- padding security sentinels
VP8LPredictorsSub[15] = PredictorSub0_C;
VP8LPredictorsSub_C[0] = PredictorSub0_C;
VP8LPredictorsSub_C[1] = PredictorSub1_C;
VP8LPredictorsSub_C[2] = PredictorSub2_C;
VP8LPredictorsSub_C[3] = PredictorSub3_C;
VP8LPredictorsSub_C[4] = PredictorSub4_C;
VP8LPredictorsSub_C[5] = PredictorSub5_C;
VP8LPredictorsSub_C[6] = PredictorSub6_C;
VP8LPredictorsSub_C[7] = PredictorSub7_C;
VP8LPredictorsSub_C[8] = PredictorSub8_C;
VP8LPredictorsSub_C[9] = PredictorSub9_C;
VP8LPredictorsSub_C[10] = PredictorSub10_C;
VP8LPredictorsSub_C[11] = PredictorSub11_C;
VP8LPredictorsSub_C[12] = PredictorSub12_C;
VP8LPredictorsSub_C[13] = PredictorSub13_C;
VP8LPredictorsSub_C[14] = PredictorSub0_C; // <- padding security sentinels
VP8LPredictorsSub_C[15] = PredictorSub0_C;
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_HAVE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
VP8LEncDspInitSSE2();
#if defined(WEBP_HAVE_SSE41)
if (VP8GetCPUInfo(kSSE4_1)) {
VP8LEncDspInitSSE41();
}
#endif
}
#endif
#if defined(WEBP_USE_MIPS32)
if (VP8GetCPUInfo(kMIPS32)) {
VP8LEncDspInitMIPS32();
}
#endif
#if defined(WEBP_USE_MIPS_DSP_R2)
if (VP8GetCPUInfo(kMIPSdspR2)) {
VP8LEncDspInitMIPSdspR2();
}
#endif
#if defined(WEBP_USE_MSA)
if (VP8GetCPUInfo(kMSA)) {
VP8LEncDspInitMSA();
}
#endif
}
#if defined(WEBP_HAVE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
VP8LEncDspInitNEON();
}
#endif
assert(VP8LSubtractGreenFromBlueAndRed != NULL);
assert(VP8LTransformColor != NULL);
assert(VP8LCollectColorBlueTransforms != NULL);
assert(VP8LCollectColorRedTransforms != NULL);
assert(VP8LFastLog2Slow != NULL);
assert(VP8LFastSLog2Slow != NULL);
assert(VP8LExtraCost != NULL);
assert(VP8LExtraCostCombined != NULL);
assert(VP8LCombinedShannonEntropy != NULL);
assert(VP8LGetEntropyUnrefined != NULL);
assert(VP8LGetCombinedEntropyUnrefined != NULL);
assert(VP8LAddVector != NULL);
assert(VP8LAddVectorEq != NULL);
assert(VP8LVectorMismatch != NULL);
assert(VP8LBundleColorMap != NULL);
assert(VP8LPredictorsSub[0] != NULL);
assert(VP8LPredictorsSub[1] != NULL);
assert(VP8LPredictorsSub[2] != NULL);
assert(VP8LPredictorsSub[3] != NULL);
assert(VP8LPredictorsSub[4] != NULL);
assert(VP8LPredictorsSub[5] != NULL);
assert(VP8LPredictorsSub[6] != NULL);
assert(VP8LPredictorsSub[7] != NULL);
assert(VP8LPredictorsSub[8] != NULL);
assert(VP8LPredictorsSub[9] != NULL);
assert(VP8LPredictorsSub[10] != NULL);
assert(VP8LPredictorsSub[11] != NULL);
assert(VP8LPredictorsSub[12] != NULL);
assert(VP8LPredictorsSub[13] != NULL);
assert(VP8LPredictorsSub[14] != NULL);
assert(VP8LPredictorsSub[15] != NULL);
assert(VP8LPredictorsSub_C[0] != NULL);
assert(VP8LPredictorsSub_C[1] != NULL);
assert(VP8LPredictorsSub_C[2] != NULL);
assert(VP8LPredictorsSub_C[3] != NULL);
assert(VP8LPredictorsSub_C[4] != NULL);
assert(VP8LPredictorsSub_C[5] != NULL);
assert(VP8LPredictorsSub_C[6] != NULL);
assert(VP8LPredictorsSub_C[7] != NULL);
assert(VP8LPredictorsSub_C[8] != NULL);
assert(VP8LPredictorsSub_C[9] != NULL);
assert(VP8LPredictorsSub_C[10] != NULL);
assert(VP8LPredictorsSub_C[11] != NULL);
assert(VP8LPredictorsSub_C[12] != NULL);
assert(VP8LPredictorsSub_C[13] != NULL);
assert(VP8LPredictorsSub_C[14] != NULL);
assert(VP8LPredictorsSub_C[15] != NULL);
}
//------------------------------------------------------------------------------

View File

@ -0,0 +1,397 @@
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// MIPS version of lossless functions
//
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
#include "src/dsp/dsp.h"
#include "src/dsp/lossless.h"
#include "src/dsp/lossless_common.h"
#if defined(WEBP_USE_MIPS32)
#include <assert.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
static float FastSLog2Slow_MIPS32(uint32_t v) {
assert(v >= LOG_LOOKUP_IDX_MAX);
if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
uint32_t log_cnt, y, correction;
const int c24 = 24;
const float v_f = (float)v;
uint32_t temp;
// Xf = 256 = 2^8
// log_cnt is index of leading one in upper 24 bits
__asm__ volatile(
"clz %[log_cnt], %[v] \n\t"
"addiu %[y], $zero, 1 \n\t"
"subu %[log_cnt], %[c24], %[log_cnt] \n\t"
"sllv %[y], %[y], %[log_cnt] \n\t"
"srlv %[temp], %[v], %[log_cnt] \n\t"
: [log_cnt]"=&r"(log_cnt), [y]"=&r"(y),
[temp]"=r"(temp)
: [c24]"r"(c24), [v]"r"(v)
);
// vf = (2^log_cnt) * Xf; where y = 2^log_cnt and Xf < 256
// Xf = floor(Xf) * (1 + (v % y) / v)
// log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v)
    // The correction factor: log(1 + d) ~ d for very small d values, so
// log2(1 + (v % y) / v) ~ LOG_2_RECIPROCAL * (v % y)/v
// LOG_2_RECIPROCAL ~ 23/16
// (v % y) = (v % 2^log_cnt) = v & (2^log_cnt - 1)
correction = (23 * (v & (y - 1))) >> 4;
return v_f * (kLog2Table[temp] + log_cnt) + correction;
} else {
return (float)(LOG_2_RECIPROCAL * v * log((double)v));
}
}
static float FastLog2Slow_MIPS32(uint32_t v) {
assert(v >= LOG_LOOKUP_IDX_MAX);
if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
uint32_t log_cnt, y;
const int c24 = 24;
double log_2;
uint32_t temp;
__asm__ volatile(
"clz %[log_cnt], %[v] \n\t"
"addiu %[y], $zero, 1 \n\t"
"subu %[log_cnt], %[c24], %[log_cnt] \n\t"
"sllv %[y], %[y], %[log_cnt] \n\t"
"srlv %[temp], %[v], %[log_cnt] \n\t"
: [log_cnt]"=&r"(log_cnt), [y]"=&r"(y),
[temp]"=r"(temp)
: [c24]"r"(c24), [v]"r"(v)
);
log_2 = kLog2Table[temp] + log_cnt;
if (v >= APPROX_LOG_MAX) {
// Since the division is still expensive, add this correction factor only
// for large values of 'v'.
const uint32_t correction = (23 * (v & (y - 1))) >> 4;
log_2 += (double)correction / v;
}
return (float)log_2;
} else {
return (float)(LOG_2_RECIPROCAL * log((double)v));
}
}
// C version of this function:
// int i = 0;
// int64_t cost = 0;
// const uint32_t* pop = &population[4];
// const uint32_t* const LoopEnd = &population[length];
// while (pop != LoopEnd) {
// ++i;
// cost += i * *pop;
// cost += i * *(pop + 1);
// pop += 2;
// }
// return (double)cost;
static double ExtraCost_MIPS32(const uint32_t* const population, int length) {
int i, temp0, temp1;
const uint32_t* pop = &population[4];
const uint32_t* const LoopEnd = &population[length];
__asm__ volatile(
"mult $zero, $zero \n\t"
"xor %[i], %[i], %[i] \n\t"
"beq %[pop], %[LoopEnd], 2f \n\t"
"1: \n\t"
"lw %[temp0], 0(%[pop]) \n\t"
"lw %[temp1], 4(%[pop]) \n\t"
"addiu %[i], %[i], 1 \n\t"
"addiu %[pop], %[pop], 8 \n\t"
"madd %[i], %[temp0] \n\t"
"madd %[i], %[temp1] \n\t"
"bne %[pop], %[LoopEnd], 1b \n\t"
"2: \n\t"
"mfhi %[temp0] \n\t"
"mflo %[temp1] \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
[i]"=&r"(i), [pop]"+r"(pop)
: [LoopEnd]"r"(LoopEnd)
: "memory", "hi", "lo"
);
return (double)((int64_t)temp0 << 32 | temp1);
}
// C version of this function:
// int i = 0;
// int64_t cost = 0;
// const uint32_t* pX = &X[4];
// const uint32_t* pY = &Y[4];
// const uint32_t* LoopEnd = &X[length];
// while (pX != LoopEnd) {
// const uint32_t xy0 = *pX + *pY;
// const uint32_t xy1 = *(pX + 1) + *(pY + 1);
// ++i;
// cost += i * xy0;
// cost += i * xy1;
// pX += 2;
// pY += 2;
// }
// return (double)cost;
static double ExtraCostCombined_MIPS32(const uint32_t* const X,
const uint32_t* const Y, int length) {
int i, temp0, temp1, temp2, temp3;
const uint32_t* pX = &X[4];
const uint32_t* pY = &Y[4];
const uint32_t* const LoopEnd = &X[length];
__asm__ volatile(
"mult $zero, $zero \n\t"
"xor %[i], %[i], %[i] \n\t"
"beq %[pX], %[LoopEnd], 2f \n\t"
"1: \n\t"
"lw %[temp0], 0(%[pX]) \n\t"
"lw %[temp1], 0(%[pY]) \n\t"
"lw %[temp2], 4(%[pX]) \n\t"
"lw %[temp3], 4(%[pY]) \n\t"
"addiu %[i], %[i], 1 \n\t"
"addu %[temp0], %[temp0], %[temp1] \n\t"
"addu %[temp2], %[temp2], %[temp3] \n\t"
"addiu %[pX], %[pX], 8 \n\t"
"addiu %[pY], %[pY], 8 \n\t"
"madd %[i], %[temp0] \n\t"
"madd %[i], %[temp2] \n\t"
"bne %[pX], %[LoopEnd], 1b \n\t"
"2: \n\t"
"mfhi %[temp0] \n\t"
"mflo %[temp1] \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
[temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
[i]"=&r"(i), [pX]"+r"(pX), [pY]"+r"(pY)
: [LoopEnd]"r"(LoopEnd)
: "memory", "hi", "lo"
);
return (double)((int64_t)temp0 << 32 | temp1);
}
#define HUFFMAN_COST_PASS \
__asm__ volatile( \
"sll %[temp1], %[temp0], 3 \n\t" \
"addiu %[temp3], %[streak], -3 \n\t" \
"addu %[temp2], %[pstreaks], %[temp1] \n\t" \
"blez %[temp3], 1f \n\t" \
"srl %[temp1], %[temp1], 1 \n\t" \
"addu %[temp3], %[pcnts], %[temp1] \n\t" \
"lw %[temp0], 4(%[temp2]) \n\t" \
"lw %[temp1], 0(%[temp3]) \n\t" \
"addu %[temp0], %[temp0], %[streak] \n\t" \
"addiu %[temp1], %[temp1], 1 \n\t" \
"sw %[temp0], 4(%[temp2]) \n\t" \
"sw %[temp1], 0(%[temp3]) \n\t" \
"b 2f \n\t" \
"1: \n\t" \
"lw %[temp0], 0(%[temp2]) \n\t" \
"addu %[temp0], %[temp0], %[streak] \n\t" \
"sw %[temp0], 0(%[temp2]) \n\t" \
"2: \n\t" \
: [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), \
[temp3]"=&r"(temp3), [temp0]"+r"(temp0) \
: [pstreaks]"r"(pstreaks), [pcnts]"r"(pcnts), \
[streak]"r"(streak) \
: "memory" \
);
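// C version of this macro (see GetEntropyUnrefinedHelper in
// lossless_enc.c), with temp0 = (*val_prev != 0):
//   stats->counts[temp0] += (streak > 3);
//   stats->streaks[temp0][(streak > 3)] += streak;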
// Returns the various RLE counts
static WEBP_INLINE void GetEntropyUnrefinedHelper(
uint32_t val, int i, uint32_t* const val_prev, int* const i_prev,
VP8LBitEntropy* const bit_entropy, VP8LStreaks* const stats) {
int* const pstreaks = &stats->streaks[0][0];
int* const pcnts = &stats->counts[0];
int temp0, temp1, temp2, temp3;
const int streak = i - *i_prev;
// Gather info for the bit entropy.
if (*val_prev != 0) {
bit_entropy->sum += (*val_prev) * streak;
bit_entropy->nonzeros += streak;
bit_entropy->nonzero_code = *i_prev;
bit_entropy->entropy -= VP8LFastSLog2(*val_prev) * streak;
if (bit_entropy->max_val < *val_prev) {
bit_entropy->max_val = *val_prev;
}
}
// Gather info for the Huffman cost.
temp0 = (*val_prev != 0);
HUFFMAN_COST_PASS
*val_prev = val;
*i_prev = i;
}
static void GetEntropyUnrefined_MIPS32(const uint32_t X[], int length,
VP8LBitEntropy* const bit_entropy,
VP8LStreaks* const stats) {
int i;
int i_prev = 0;
uint32_t x_prev = X[0];
memset(stats, 0, sizeof(*stats));
VP8LBitEntropyInit(bit_entropy);
for (i = 1; i < length; ++i) {
const uint32_t x = X[i];
if (x != x_prev) {
GetEntropyUnrefinedHelper(x, i, &x_prev, &i_prev, bit_entropy, stats);
}
}
GetEntropyUnrefinedHelper(0, i, &x_prev, &i_prev, bit_entropy, stats);
bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
}
static void GetCombinedEntropyUnrefined_MIPS32(const uint32_t X[],
const uint32_t Y[],
int length,
VP8LBitEntropy* const entropy,
VP8LStreaks* const stats) {
int i = 1;
int i_prev = 0;
uint32_t xy_prev = X[0] + Y[0];
memset(stats, 0, sizeof(*stats));
VP8LBitEntropyInit(entropy);
for (i = 1; i < length; ++i) {
const uint32_t xy = X[i] + Y[i];
if (xy != xy_prev) {
GetEntropyUnrefinedHelper(xy, i, &xy_prev, &i_prev, entropy, stats);
}
}
GetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, entropy, stats);
entropy->entropy += VP8LFastSLog2(entropy->sum);
}
#define ASM_START \
__asm__ volatile( \
".set push \n\t" \
".set at \n\t" \
".set macro \n\t" \
"1: \n\t"
// P2 = P0 + P1
// A..D - offsets
// E - temp variable to tell macro
// if pointer should be incremented
// literal_ and successive histograms could be unaligned
// so we must use ulw and usw
#define ADD_TO_OUT(A, B, C, D, E, P0, P1, P2) \
"ulw %[temp0], " #A "(%[" #P0 "]) \n\t" \
"ulw %[temp1], " #B "(%[" #P0 "]) \n\t" \
"ulw %[temp2], " #C "(%[" #P0 "]) \n\t" \
"ulw %[temp3], " #D "(%[" #P0 "]) \n\t" \
"ulw %[temp4], " #A "(%[" #P1 "]) \n\t" \
"ulw %[temp5], " #B "(%[" #P1 "]) \n\t" \
"ulw %[temp6], " #C "(%[" #P1 "]) \n\t" \
"ulw %[temp7], " #D "(%[" #P1 "]) \n\t" \
"addu %[temp4], %[temp4], %[temp0] \n\t" \
"addu %[temp5], %[temp5], %[temp1] \n\t" \
"addu %[temp6], %[temp6], %[temp2] \n\t" \
"addu %[temp7], %[temp7], %[temp3] \n\t" \
"addiu %[" #P0 "], %[" #P0 "], 16 \n\t" \
".if " #E " == 1 \n\t" \
"addiu %[" #P1 "], %[" #P1 "], 16 \n\t" \
".endif \n\t" \
"usw %[temp4], " #A "(%[" #P2 "]) \n\t" \
"usw %[temp5], " #B "(%[" #P2 "]) \n\t" \
"usw %[temp6], " #C "(%[" #P2 "]) \n\t" \
"usw %[temp7], " #D "(%[" #P2 "]) \n\t" \
"addiu %[" #P2 "], %[" #P2 "], 16 \n\t" \
"bne %[" #P0 "], %[LoopEnd], 1b \n\t" \
".set pop \n\t" \
#define ASM_END_COMMON_0 \
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), \
[temp2]"=&r"(temp2), [temp3]"=&r"(temp3), \
[temp4]"=&r"(temp4), [temp5]"=&r"(temp5), \
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), \
[pa]"+r"(pa), [pout]"+r"(pout)
#define ASM_END_COMMON_1 \
: [LoopEnd]"r"(LoopEnd) \
: "memory", "at" \
);
#define ASM_END_0 \
ASM_END_COMMON_0 \
, [pb]"+r"(pb) \
ASM_END_COMMON_1
#define ASM_END_1 \
ASM_END_COMMON_0 \
ASM_END_COMMON_1
static void AddVector_MIPS32(const uint32_t* pa, const uint32_t* pb,
uint32_t* pout, int size) {
uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
const uint32_t end = ((size) / 4) * 4;
const uint32_t* const LoopEnd = pa + end;
int i;
ASM_START
ADD_TO_OUT(0, 4, 8, 12, 1, pa, pb, pout)
ASM_END_0
for (i = end; i < size; ++i) pout[i] = pa[i] + pb[i];
}
static void AddVectorEq_MIPS32(const uint32_t* pa, uint32_t* pout, int size) {
uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
const uint32_t end = ((size) / 4) * 4;
const uint32_t* const LoopEnd = pa + end;
int i;
ASM_START
ADD_TO_OUT(0, 4, 8, 12, 0, pa, pout, pout)
ASM_END_1
for (i = end; i < size; ++i) pout[i] += pa[i];
}
#undef ASM_END_1
#undef ASM_END_0
#undef ASM_END_COMMON_1
#undef ASM_END_COMMON_0
#undef ADD_TO_OUT
#undef ASM_START
//------------------------------------------------------------------------------
// Entry point
extern void VP8LEncDspInitMIPS32(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMIPS32(void) {
VP8LFastSLog2Slow = FastSLog2Slow_MIPS32;
VP8LFastLog2Slow = FastLog2Slow_MIPS32;
VP8LExtraCost = ExtraCost_MIPS32;
VP8LExtraCostCombined = ExtraCostCombined_MIPS32;
VP8LGetEntropyUnrefined = GetEntropyUnrefined_MIPS32;
VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined_MIPS32;
VP8LAddVector = AddVector_MIPS32;
VP8LAddVectorEq = AddVectorEq_MIPS32;
}
#else // !WEBP_USE_MIPS32
WEBP_DSP_INIT_STUB(VP8LEncDspInitMIPS32)
#endif // WEBP_USE_MIPS32

View File

@ -0,0 +1,281 @@
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Image transform methods for lossless encoder.
//
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
#include "src/dsp/lossless.h"
static void SubtractGreenFromBlueAndRed_MIPSdspR2(uint32_t* argb_data,
int num_pixels) {
uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
uint32_t* const p_loop1_end = argb_data + (num_pixels & ~3);
uint32_t* const p_loop2_end = p_loop1_end + (num_pixels & 3);
__asm__ volatile (
".set push \n\t"
".set noreorder \n\t"
"beq %[argb_data], %[p_loop1_end], 3f \n\t"
" nop \n\t"
"0: \n\t"
"lw %[temp0], 0(%[argb_data]) \n\t"
"lw %[temp1], 4(%[argb_data]) \n\t"
"lw %[temp2], 8(%[argb_data]) \n\t"
"lw %[temp3], 12(%[argb_data]) \n\t"
"ext %[temp4], %[temp0], 8, 8 \n\t"
"ext %[temp5], %[temp1], 8, 8 \n\t"
"ext %[temp6], %[temp2], 8, 8 \n\t"
"ext %[temp7], %[temp3], 8, 8 \n\t"
"addiu %[argb_data], %[argb_data], 16 \n\t"
"replv.ph %[temp4], %[temp4] \n\t"
"replv.ph %[temp5], %[temp5] \n\t"
"replv.ph %[temp6], %[temp6] \n\t"
"replv.ph %[temp7], %[temp7] \n\t"
"subu.qb %[temp0], %[temp0], %[temp4] \n\t"
"subu.qb %[temp1], %[temp1], %[temp5] \n\t"
"subu.qb %[temp2], %[temp2], %[temp6] \n\t"
"subu.qb %[temp3], %[temp3], %[temp7] \n\t"
"sw %[temp0], -16(%[argb_data]) \n\t"
"sw %[temp1], -12(%[argb_data]) \n\t"
"sw %[temp2], -8(%[argb_data]) \n\t"
"bne %[argb_data], %[p_loop1_end], 0b \n\t"
" sw %[temp3], -4(%[argb_data]) \n\t"
"3: \n\t"
"beq %[argb_data], %[p_loop2_end], 2f \n\t"
" nop \n\t"
"1: \n\t"
"lw %[temp0], 0(%[argb_data]) \n\t"
"addiu %[argb_data], %[argb_data], 4 \n\t"
"ext %[temp4], %[temp0], 8, 8 \n\t"
"replv.ph %[temp4], %[temp4] \n\t"
"subu.qb %[temp0], %[temp0], %[temp4] \n\t"
"bne %[argb_data], %[p_loop2_end], 1b \n\t"
" sw %[temp0], -4(%[argb_data]) \n\t"
"2: \n\t"
".set pop \n\t"
: [argb_data]"+&r"(argb_data), [temp0]"=&r"(temp0),
[temp1]"=&r"(temp1), [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
[temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6),
[temp7]"=&r"(temp7)
: [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
: "memory"
);
}
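// The loop above mirrors the plain-C transform: 'ext' pulls the green byte
// out of each pixel, 'replv.ph' replicates it into both halfwords
// (0x00gg00gg), and 'subu.qb' subtracts it from the red and blue bytes in a
// single parallel byte operation; the main loop handles four pixels per
// iteration.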
static WEBP_INLINE uint32_t ColorTransformDelta(int8_t color_pred,
int8_t color) {
return (uint32_t)((int)(color_pred) * color) >> 5;
}
static void TransformColor_MIPSdspR2(const VP8LMultipliers* const m,
uint32_t* data, int num_pixels) {
int temp0, temp1, temp2, temp3, temp4, temp5;
uint32_t argb, argb1, new_red, new_red1;
const uint32_t G_to_R = m->green_to_red_;
const uint32_t G_to_B = m->green_to_blue_;
const uint32_t R_to_B = m->red_to_blue_;
uint32_t* const p_loop_end = data + (num_pixels & ~1);
__asm__ volatile (
".set push \n\t"
".set noreorder \n\t"
"beq %[data], %[p_loop_end], 1f \n\t"
" nop \n\t"
"replv.ph %[temp0], %[G_to_R] \n\t"
"replv.ph %[temp1], %[G_to_B] \n\t"
"replv.ph %[temp2], %[R_to_B] \n\t"
"shll.ph %[temp0], %[temp0], 8 \n\t"
"shll.ph %[temp1], %[temp1], 8 \n\t"
"shll.ph %[temp2], %[temp2], 8 \n\t"
"shra.ph %[temp0], %[temp0], 8 \n\t"
"shra.ph %[temp1], %[temp1], 8 \n\t"
"shra.ph %[temp2], %[temp2], 8 \n\t"
"0: \n\t"
"lw %[argb], 0(%[data]) \n\t"
"lw %[argb1], 4(%[data]) \n\t"
"lhu %[new_red], 2(%[data]) \n\t"
"lhu %[new_red1], 6(%[data]) \n\t"
"precrq.qb.ph %[temp3], %[argb], %[argb1] \n\t"
"precr.qb.ph %[temp4], %[argb], %[argb1] \n\t"
"preceu.ph.qbra %[temp3], %[temp3] \n\t"
"preceu.ph.qbla %[temp4], %[temp4] \n\t"
"shll.ph %[temp3], %[temp3], 8 \n\t"
"shll.ph %[temp4], %[temp4], 8 \n\t"
"shra.ph %[temp3], %[temp3], 8 \n\t"
"shra.ph %[temp4], %[temp4], 8 \n\t"
"mul.ph %[temp5], %[temp3], %[temp0] \n\t"
"mul.ph %[temp3], %[temp3], %[temp1] \n\t"
"mul.ph %[temp4], %[temp4], %[temp2] \n\t"
"addiu %[data], %[data], 8 \n\t"
"ins %[new_red1], %[new_red], 16, 16 \n\t"
"ins %[argb1], %[argb], 16, 16 \n\t"
"shra.ph %[temp5], %[temp5], 5 \n\t"
"shra.ph %[temp3], %[temp3], 5 \n\t"
"shra.ph %[temp4], %[temp4], 5 \n\t"
"subu.ph %[new_red1], %[new_red1], %[temp5] \n\t"
"subu.ph %[argb1], %[argb1], %[temp3] \n\t"
"preceu.ph.qbra %[temp5], %[new_red1] \n\t"
"subu.ph %[argb1], %[argb1], %[temp4] \n\t"
"preceu.ph.qbra %[temp3], %[argb1] \n\t"
"sb %[temp5], -2(%[data]) \n\t"
"sb %[temp3], -4(%[data]) \n\t"
"sra %[temp5], %[temp5], 16 \n\t"
"sra %[temp3], %[temp3], 16 \n\t"
"sb %[temp5], -6(%[data]) \n\t"
"bne %[data], %[p_loop_end], 0b \n\t"
" sb %[temp3], -8(%[data]) \n\t"
"1: \n\t"
".set pop \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[new_red1]"=&r"(new_red1), [new_red]"=&r"(new_red),
[argb]"=&r"(argb), [argb1]"=&r"(argb1), [data]"+&r"(data)
: [G_to_R]"r"(G_to_R), [R_to_B]"r"(R_to_B),
[G_to_B]"r"(G_to_B), [p_loop_end]"r"(p_loop_end)
: "memory", "hi", "lo"
);
if (num_pixels & 1) {
const uint32_t argb_ = data[0];
const uint32_t green = argb_ >> 8;
const uint32_t red = argb_ >> 16;
uint32_t new_blue = argb_;
new_red = red;
new_red -= ColorTransformDelta(m->green_to_red_, green);
new_red &= 0xff;
new_blue -= ColorTransformDelta(m->green_to_blue_, green);
new_blue -= ColorTransformDelta(m->red_to_blue_, red);
new_blue &= 0xff;
data[0] = (argb_ & 0xff00ff00u) | (new_red << 16) | (new_blue);
}
}
static WEBP_INLINE uint8_t TransformColorBlue(uint8_t green_to_blue,
uint8_t red_to_blue,
uint32_t argb) {
const uint32_t green = argb >> 8;
const uint32_t red = argb >> 16;
uint8_t new_blue = argb;
new_blue -= ColorTransformDelta(green_to_blue, green);
new_blue -= ColorTransformDelta(red_to_blue, red);
return (new_blue & 0xff);
}
static void CollectColorBlueTransforms_MIPSdspR2(const uint32_t* argb,
int stride,
int tile_width,
int tile_height,
int green_to_blue,
int red_to_blue,
int histo[]) {
const int rtb = (red_to_blue << 16) | (red_to_blue & 0xffff);
const int gtb = (green_to_blue << 16) | (green_to_blue & 0xffff);
const uint32_t mask = 0xff00ffu;
while (tile_height-- > 0) {
int x;
const uint32_t* p_argb = argb;
argb += stride;
for (x = 0; x < (tile_width >> 1); ++x) {
int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
__asm__ volatile (
"lw %[temp0], 0(%[p_argb]) \n\t"
"lw %[temp1], 4(%[p_argb]) \n\t"
"precr.qb.ph %[temp2], %[temp0], %[temp1] \n\t"
"ins %[temp1], %[temp0], 16, 16 \n\t"
"shra.ph %[temp2], %[temp2], 8 \n\t"
"shra.ph %[temp3], %[temp1], 8 \n\t"
"mul.ph %[temp5], %[temp2], %[rtb] \n\t"
"mul.ph %[temp6], %[temp3], %[gtb] \n\t"
"and %[temp4], %[temp1], %[mask] \n\t"
"addiu %[p_argb], %[p_argb], 8 \n\t"
"shra.ph %[temp5], %[temp5], 5 \n\t"
"shra.ph %[temp6], %[temp6], 5 \n\t"
"subu.qb %[temp2], %[temp4], %[temp5] \n\t"
"subu.qb %[temp2], %[temp2], %[temp6] \n\t"
: [p_argb]"+&r"(p_argb), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
[temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
[temp5]"=&r"(temp5), [temp6]"=&r"(temp6)
: [rtb]"r"(rtb), [gtb]"r"(gtb), [mask]"r"(mask)
: "memory", "hi", "lo"
);
++histo[(uint8_t)(temp2 >> 16)];
++histo[(uint8_t)temp2];
}
if (tile_width & 1) {
++histo[TransformColorBlue(green_to_blue, red_to_blue, *p_argb)];
}
}
}
static WEBP_INLINE uint8_t TransformColorRed(uint8_t green_to_red,
uint32_t argb) {
const uint32_t green = argb >> 8;
uint32_t new_red = argb >> 16;
new_red -= ColorTransformDelta(green_to_red, green);
return (new_red & 0xff);
}
static void CollectColorRedTransforms_MIPSdspR2(const uint32_t* argb,
int stride,
int tile_width,
int tile_height,
int green_to_red,
int histo[]) {
const int gtr = (green_to_red << 16) | (green_to_red & 0xffff);
while (tile_height-- > 0) {
int x;
const uint32_t* p_argb = argb;
argb += stride;
for (x = 0; x < (tile_width >> 1); ++x) {
int temp0, temp1, temp2, temp3, temp4;
__asm__ volatile (
"lw %[temp0], 0(%[p_argb]) \n\t"
"lw %[temp1], 4(%[p_argb]) \n\t"
"precrq.ph.w %[temp4], %[temp0], %[temp1] \n\t"
"ins %[temp1], %[temp0], 16, 16 \n\t"
"shra.ph %[temp3], %[temp1], 8 \n\t"
"mul.ph %[temp2], %[temp3], %[gtr] \n\t"
"addiu %[p_argb], %[p_argb], 8 \n\t"
"shra.ph %[temp2], %[temp2], 5 \n\t"
"subu.qb %[temp2], %[temp4], %[temp2] \n\t"
: [p_argb]"+&r"(p_argb), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
[temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4)
: [gtr]"r"(gtr)
: "memory", "hi", "lo"
);
++histo[(uint8_t)(temp2 >> 16)];
++histo[(uint8_t)temp2];
}
if (tile_width & 1) {
++histo[TransformColorRed(green_to_red, *p_argb)];
}
}
}
//------------------------------------------------------------------------------
// Entry point
extern void VP8LEncDspInitMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMIPSdspR2(void) {
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_MIPSdspR2;
VP8LTransformColor = TransformColor_MIPSdspR2;
VP8LCollectColorBlueTransforms = CollectColorBlueTransforms_MIPSdspR2;
VP8LCollectColorRedTransforms = CollectColorRedTransforms_MIPSdspR2;
}
#else // !WEBP_USE_MIPS_DSP_R2
WEBP_DSP_INIT_STUB(VP8LEncDspInitMIPSdspR2)
#endif // WEBP_USE_MIPS_DSP_R2

View File

@ -0,0 +1,148 @@
// Copyright 2016 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// MSA variant of Image transform methods for lossless encoder.
//
// Authors: Prashant Patil (Prashant.Patil@imgtec.com)
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MSA)
#include "src/dsp/lossless.h"
#include "src/dsp/msa_macro.h"
#define TRANSFORM_COLOR_8(src0, src1, dst0, dst1, c0, c1, mask0, mask1) do { \
v8i16 g0, g1, t0, t1, t2, t3; \
v4i32 t4, t5; \
VSHF_B2_SH(src0, src0, src1, src1, mask0, mask0, g0, g1); \
DOTP_SB2_SH(g0, g1, c0, c0, t0, t1); \
SRAI_H2_SH(t0, t1, 5); \
t0 = __msa_subv_h((v8i16)src0, t0); \
t1 = __msa_subv_h((v8i16)src1, t1); \
t4 = __msa_srli_w((v4i32)src0, 16); \
t5 = __msa_srli_w((v4i32)src1, 16); \
DOTP_SB2_SH(t4, t5, c1, c1, t2, t3); \
SRAI_H2_SH(t2, t3, 5); \
SUB2(t0, t2, t1, t3, t0, t1); \
VSHF_B2_UB(src0, t0, src1, t1, mask1, mask1, dst0, dst1); \
} while (0)
#define TRANSFORM_COLOR_4(src, dst, c0, c1, mask0, mask1) do { \
const v16i8 g0 = VSHF_SB(src, src, mask0); \
v8i16 t0 = __msa_dotp_s_h(c0, g0); \
v8i16 t1; \
v4i32 t2; \
t0 = SRAI_H(t0, 5); \
t0 = __msa_subv_h((v8i16)src, t0); \
t2 = __msa_srli_w((v4i32)src, 16); \
t1 = __msa_dotp_s_h(c1, (v16i8)t2); \
t1 = SRAI_H(t1, 5); \
t0 = t0 - t1; \
dst = VSHF_UB(src, t0, mask1); \
} while (0)
static void TransformColor_MSA(const VP8LMultipliers* const m, uint32_t* data,
int num_pixels) {
v16u8 src0, dst0;
const v16i8 g2br = (v16i8)__msa_fill_w(m->green_to_blue_ |
(m->green_to_red_ << 16));
const v16i8 r2b = (v16i8)__msa_fill_w(m->red_to_blue_);
const v16u8 mask0 = { 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255,
13, 255, 13, 255 };
const v16u8 mask1 = { 16, 1, 18, 3, 20, 5, 22, 7, 24, 9, 26, 11,
28, 13, 30, 15 };
while (num_pixels >= 8) {
v16u8 src1, dst1;
LD_UB2(data, 4, src0, src1);
TRANSFORM_COLOR_8(src0, src1, dst0, dst1, g2br, r2b, mask0, mask1);
ST_UB2(dst0, dst1, data, 4);
data += 8;
num_pixels -= 8;
}
if (num_pixels > 0) {
if (num_pixels >= 4) {
src0 = LD_UB(data);
TRANSFORM_COLOR_4(src0, dst0, g2br, r2b, mask0, mask1);
ST_UB(dst0, data);
data += 4;
num_pixels -= 4;
}
if (num_pixels > 0) {
src0 = LD_UB(data);
TRANSFORM_COLOR_4(src0, dst0, g2br, r2b, mask0, mask1);
if (num_pixels == 3) {
const uint64_t pix_d = __msa_copy_s_d((v2i64)dst0, 0);
const uint32_t pix_w = __msa_copy_s_w((v4i32)dst0, 2);
SD(pix_d, data + 0);
SW(pix_w, data + 2);
} else if (num_pixels == 2) {
const uint64_t pix_d = __msa_copy_s_d((v2i64)dst0, 0);
SD(pix_d, data);
} else {
const uint32_t pix_w = __msa_copy_s_w((v4i32)dst0, 0);
SW(pix_w, data);
}
}
}
}
static void SubtractGreenFromBlueAndRed_MSA(uint32_t* argb_data,
int num_pixels) {
int i;
uint8_t* ptemp_data = (uint8_t*)argb_data;
v16u8 src0, dst0, tmp0;
const v16u8 mask = { 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255,
13, 255, 13, 255 };
while (num_pixels >= 8) {
v16u8 src1, dst1, tmp1;
LD_UB2(ptemp_data, 16, src0, src1);
VSHF_B2_UB(src0, src1, src1, src0, mask, mask, tmp0, tmp1);
SUB2(src0, tmp0, src1, tmp1, dst0, dst1);
ST_UB2(dst0, dst1, ptemp_data, 16);
ptemp_data += 8 * 4;
num_pixels -= 8;
}
if (num_pixels > 0) {
if (num_pixels >= 4) {
src0 = LD_UB(ptemp_data);
tmp0 = VSHF_UB(src0, src0, mask);
dst0 = src0 - tmp0;
ST_UB(dst0, ptemp_data);
ptemp_data += 4 * 4;
num_pixels -= 4;
}
for (i = 0; i < num_pixels; i++) {
const uint8_t b = ptemp_data[0];
const uint8_t g = ptemp_data[1];
const uint8_t r = ptemp_data[2];
ptemp_data[0] = (b - g) & 0xff;
ptemp_data[2] = (r - g) & 0xff;
ptemp_data += 4;
}
}
}
//------------------------------------------------------------------------------
// Entry point
extern void VP8LEncDspInitMSA(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMSA(void) {
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_MSA;
VP8LTransformColor = TransformColor_MSA;
}
#else // !WEBP_USE_MSA
WEBP_DSP_INIT_STUB(VP8LEncDspInitMSA)
#endif // WEBP_USE_MSA

View File

@ -0,0 +1,144 @@
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// NEON variant of methods for lossless encoder
//
// Author: Skal (pascal.massimino@gmail.com)
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_NEON)
#include <arm_neon.h>
#include "src/dsp/lossless.h"
#include "src/dsp/neon.h"
//------------------------------------------------------------------------------
// Subtract-Green Transform
// vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
// non-standard versions there.
#if defined(__APPLE__) && defined(__aarch64__) && \
    defined(__apple_build_version__) && (__apple_build_version__ < 6020037)
#define USE_VTBLQ
#endif
#ifdef USE_VTBLQ
// 255 = byte will be zeroed
static const uint8_t kGreenShuffle[16] = {
1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255
};
static WEBP_INLINE uint8x16_t DoGreenShuffle_NEON(const uint8x16_t argb,
const uint8x16_t shuffle) {
return vcombine_u8(vtbl1q_u8(argb, vget_low_u8(shuffle)),
vtbl1q_u8(argb, vget_high_u8(shuffle)));
}
#else // !USE_VTBLQ
// 255 = byte will be zeroed
static const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255 };
static WEBP_INLINE uint8x16_t DoGreenShuffle_NEON(const uint8x16_t argb,
const uint8x8_t shuffle) {
return vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
vtbl1_u8(vget_high_u8(argb), shuffle));
}
#endif // USE_VTBLQ
static void SubtractGreenFromBlueAndRed_NEON(uint32_t* argb_data,
int num_pixels) {
const uint32_t* const end = argb_data + (num_pixels & ~3);
#ifdef USE_VTBLQ
const uint8x16_t shuffle = vld1q_u8(kGreenShuffle);
#else
const uint8x8_t shuffle = vld1_u8(kGreenShuffle);
#endif
for (; argb_data < end; argb_data += 4) {
const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data);
const uint8x16_t greens = DoGreenShuffle_NEON(argb, shuffle);
vst1q_u8((uint8_t*)argb_data, vsubq_u8(argb, greens));
}
// fallthrough and finish off with plain-C
VP8LSubtractGreenFromBlueAndRed_C(argb_data, num_pixels & 3);
}
//------------------------------------------------------------------------------
// Color Transform
static void TransformColor_NEON(const VP8LMultipliers* const m,
uint32_t* argb_data, int num_pixels) {
// sign-extended multiplying constants, pre-shifted by 6.
#define CST(X) (((int16_t)(m->X << 8)) >> 6)
const int16_t rb[8] = {
CST(green_to_blue_), CST(green_to_red_),
CST(green_to_blue_), CST(green_to_red_),
CST(green_to_blue_), CST(green_to_red_),
CST(green_to_blue_), CST(green_to_red_)
};
const int16x8_t mults_rb = vld1q_s16(rb);
const int16_t b2[8] = {
0, CST(red_to_blue_), 0, CST(red_to_blue_),
0, CST(red_to_blue_), 0, CST(red_to_blue_),
};
const int16x8_t mults_b2 = vld1q_s16(b2);
#undef CST
#ifdef USE_VTBLQ
static const uint8_t kg0g0[16] = {
255, 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13
};
const uint8x16_t shuffle = vld1q_u8(kg0g0);
#else
static const uint8_t k0g0g[8] = { 255, 1, 255, 1, 255, 5, 255, 5 };
const uint8x8_t shuffle = vld1_u8(k0g0g);
#endif
const uint32x4_t mask_rb = vdupq_n_u32(0x00ff00ffu); // red-blue masks
int i;
for (i = 0; i + 4 <= num_pixels; i += 4) {
const uint8x16_t in = vld1q_u8((uint8_t*)(argb_data + i));
// 0 g 0 g
const uint8x16_t greens = DoGreenShuffle_NEON(in, shuffle);
// x dr x db1
const int16x8_t A = vqdmulhq_s16(vreinterpretq_s16_u8(greens), mults_rb);
// r 0 b 0
const int16x8_t B = vshlq_n_s16(vreinterpretq_s16_u8(in), 8);
// x db2 0 0
const int16x8_t C = vqdmulhq_s16(B, mults_b2);
// 0 0 x db2
const uint32x4_t D = vshrq_n_u32(vreinterpretq_u32_s16(C), 16);
// x dr x db
const int8x16_t E = vaddq_s8(vreinterpretq_s8_u32(D),
vreinterpretq_s8_s16(A));
// 0 dr 0 db
const uint32x4_t F = vandq_u32(vreinterpretq_u32_s8(E), mask_rb);
const int8x16_t out = vsubq_s8(vreinterpretq_s8_u8(in),
vreinterpretq_s8_u32(F));
vst1q_s8((int8_t*)(argb_data + i), out);
}
// fallthrough and finish off with plain-C
VP8LTransformColor_C(m, argb_data + i, num_pixels - i);
}
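// Note: vqdmulhq_s16() computes (a * b * 2) >> 16, so CST() pre-shifts by 6
// rather than the 5 used by the scalar and SSE2 code; the halved constant
// compensates for the doubling.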
#undef USE_VTBLQ
//------------------------------------------------------------------------------
// Entry point
extern void VP8LEncDspInitNEON(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitNEON(void) {
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_NEON;
VP8LTransformColor = TransformColor_NEON;
}
#else // !WEBP_USE_NEON
WEBP_DSP_INIT_STUB(VP8LEncDspInitNEON)
#endif // WEBP_USE_NEON

View File

@ -0,0 +1,669 @@
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE2 variant of methods for lossless encoder
//
// Author: Skal (pascal.massimino@gmail.com)
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_SSE2)
#include <assert.h>
#include <emmintrin.h>
#include "src/dsp/lossless.h"
#include "src/dsp/common_sse2.h"
#include "src/dsp/lossless_common.h"
// For sign-extended multiplying constants, pre-shifted by 5:
#define CST_5b(X) (((int16_t)((uint16_t)(X) << 8)) >> 5)
//------------------------------------------------------------------------------
// Subtract-Green Transform
static void SubtractGreenFromBlueAndRed_SSE2(uint32_t* argb_data,
int num_pixels) {
int i;
for (i = 0; i + 4 <= num_pixels; i += 4) {
const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb
const __m128i A = _mm_srli_epi16(in, 8); // 0 a 0 g
const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0)); // 0g0g
const __m128i out = _mm_sub_epi8(in, C);
_mm_storeu_si128((__m128i*)&argb_data[i], out);
}
// fallthrough and finish off with plain-C
if (i != num_pixels) {
VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i);
}
}
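// The shuffle pair above duplicates the green byte into the low byte of
// both 16-bit halves of each pixel, so for 0xff405060 the subtrahend C is
// 0x00500050 and the byte-wise subtraction yields 0xfff05010, matching the
// plain-C version.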
//------------------------------------------------------------------------------
// Color Transform
#define MK_CST_16(HI, LO) \
_mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
static void TransformColor_SSE2(const VP8LMultipliers* const m,
uint32_t* argb_data, int num_pixels) {
const __m128i mults_rb = MK_CST_16(CST_5b(m->green_to_red_),
CST_5b(m->green_to_blue_));
const __m128i mults_b2 = MK_CST_16(CST_5b(m->red_to_blue_), 0);
const __m128i mask_ag = _mm_set1_epi32(0xff00ff00); // alpha-green masks
const __m128i mask_rb = _mm_set1_epi32(0x00ff00ff); // red-blue masks
int i;
for (i = 0; i + 4 <= num_pixels; i += 4) {
const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb
const __m128i A = _mm_and_si128(in, mask_ag); // a 0 g 0
const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0)); // g0g0
const __m128i D = _mm_mulhi_epi16(C, mults_rb); // x dr x db1
const __m128i E = _mm_slli_epi16(in, 8); // r 0 b 0
const __m128i F = _mm_mulhi_epi16(E, mults_b2); // x db2 0 0
const __m128i G = _mm_srli_epi32(F, 16); // 0 0 x db2
const __m128i H = _mm_add_epi8(G, D); // x dr x db
const __m128i I = _mm_and_si128(H, mask_rb); // 0 dr 0 db
const __m128i out = _mm_sub_epi8(in, I);
_mm_storeu_si128((__m128i*)&argb_data[i], out);
}
// fallthrough and finish off with plain-C
if (i != num_pixels) {
VP8LTransformColor_C(m, argb_data + i, num_pixels - i);
}
}
//------------------------------------------------------------------------------
#define SPAN 8
static void CollectColorBlueTransforms_SSE2(const uint32_t* argb, int stride,
int tile_width, int tile_height,
int green_to_blue, int red_to_blue,
int histo[]) {
const __m128i mults_r = MK_CST_16(CST_5b(red_to_blue), 0);
const __m128i mults_g = MK_CST_16(0, CST_5b(green_to_blue));
const __m128i mask_g = _mm_set1_epi32(0x00ff00); // green mask
const __m128i mask_b = _mm_set1_epi32(0x0000ff); // blue mask
int y;
for (y = 0; y < tile_height; ++y) {
const uint32_t* const src = argb + y * stride;
int i, x;
for (x = 0; x + SPAN <= tile_width; x += SPAN) {
uint16_t values[SPAN];
const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]);
const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]);
const __m128i A0 = _mm_slli_epi16(in0, 8); // r 0 | b 0
const __m128i A1 = _mm_slli_epi16(in1, 8);
const __m128i B0 = _mm_and_si128(in0, mask_g); // 0 0 | g 0
const __m128i B1 = _mm_and_si128(in1, mask_g);
const __m128i C0 = _mm_mulhi_epi16(A0, mults_r); // x db | 0 0
const __m128i C1 = _mm_mulhi_epi16(A1, mults_r);
const __m128i D0 = _mm_mulhi_epi16(B0, mults_g); // 0 0 | x db
const __m128i D1 = _mm_mulhi_epi16(B1, mults_g);
const __m128i E0 = _mm_sub_epi8(in0, D0); // x x | x b'
const __m128i E1 = _mm_sub_epi8(in1, D1);
const __m128i F0 = _mm_srli_epi32(C0, 16); // 0 0 | x db
const __m128i F1 = _mm_srli_epi32(C1, 16);
const __m128i G0 = _mm_sub_epi8(E0, F0); // 0 0 | x b'
const __m128i G1 = _mm_sub_epi8(E1, F1);
const __m128i H0 = _mm_and_si128(G0, mask_b); // 0 0 | 0 b
const __m128i H1 = _mm_and_si128(G1, mask_b);
const __m128i I = _mm_packs_epi32(H0, H1); // 0 b' | 0 b'
_mm_storeu_si128((__m128i*)values, I);
for (i = 0; i < SPAN; ++i) ++histo[values[i]];
}
}
{
const int left_over = tile_width & (SPAN - 1);
if (left_over > 0) {
VP8LCollectColorBlueTransforms_C(argb + tile_width - left_over, stride,
left_over, tile_height,
green_to_blue, red_to_blue, histo);
}
}
}
static void CollectColorRedTransforms_SSE2(const uint32_t* argb, int stride,
int tile_width, int tile_height,
int green_to_red, int histo[]) {
const __m128i mults_g = MK_CST_16(0, CST_5b(green_to_red));
const __m128i mask_g = _mm_set1_epi32(0x00ff00); // green mask
const __m128i mask = _mm_set1_epi32(0xff);
int y;
for (y = 0; y < tile_height; ++y) {
const uint32_t* const src = argb + y * stride;
int i, x;
for (x = 0; x + SPAN <= tile_width; x += SPAN) {
uint16_t values[SPAN];
const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]);
const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]);
const __m128i A0 = _mm_and_si128(in0, mask_g); // 0 0 | g 0
const __m128i A1 = _mm_and_si128(in1, mask_g);
const __m128i B0 = _mm_srli_epi32(in0, 16); // 0 0 | x r
const __m128i B1 = _mm_srli_epi32(in1, 16);
const __m128i C0 = _mm_mulhi_epi16(A0, mults_g); // 0 0 | x dr
const __m128i C1 = _mm_mulhi_epi16(A1, mults_g);
const __m128i E0 = _mm_sub_epi8(B0, C0); // x x | x r'
const __m128i E1 = _mm_sub_epi8(B1, C1);
const __m128i F0 = _mm_and_si128(E0, mask); // 0 0 | 0 r'
const __m128i F1 = _mm_and_si128(E1, mask);
const __m128i I = _mm_packs_epi32(F0, F1);
_mm_storeu_si128((__m128i*)values, I);
for (i = 0; i < SPAN; ++i) ++histo[values[i]];
}
}
{
const int left_over = tile_width & (SPAN - 1);
if (left_over > 0) {
VP8LCollectColorRedTransforms_C(argb + tile_width - left_over, stride,
left_over, tile_height,
green_to_red, histo);
}
}
}
#undef SPAN
#undef MK_CST_16
//------------------------------------------------------------------------------
// Note we are adding uint32_t's as *signed* int32's (using _mm_add_epi32). But
// that's ok since the histogram values are less than 1<<28 (max picture size).
#define LINE_SIZE 16 // 8 or 16
static void AddVector_SSE2(const uint32_t* a, const uint32_t* b, uint32_t* out,
int size) {
int i;
for (i = 0; i + LINE_SIZE <= size; i += LINE_SIZE) {
const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]);
const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]);
#if (LINE_SIZE == 16)
const __m128i a2 = _mm_loadu_si128((const __m128i*)&a[i + 8]);
const __m128i a3 = _mm_loadu_si128((const __m128i*)&a[i + 12]);
#endif
const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[i + 0]);
const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[i + 4]);
#if (LINE_SIZE == 16)
const __m128i b2 = _mm_loadu_si128((const __m128i*)&b[i + 8]);
const __m128i b3 = _mm_loadu_si128((const __m128i*)&b[i + 12]);
#endif
_mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0));
_mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1));
#if (LINE_SIZE == 16)
_mm_storeu_si128((__m128i*)&out[i + 8], _mm_add_epi32(a2, b2));
_mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3));
#endif
}
for (; i < size; ++i) {
out[i] = a[i] + b[i];
}
}
static void AddVectorEq_SSE2(const uint32_t* a, uint32_t* out, int size) {
int i;
for (i = 0; i + LINE_SIZE <= size; i += LINE_SIZE) {
const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]);
const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]);
#if (LINE_SIZE == 16)
const __m128i a2 = _mm_loadu_si128((const __m128i*)&a[i + 8]);
const __m128i a3 = _mm_loadu_si128((const __m128i*)&a[i + 12]);
#endif
const __m128i b0 = _mm_loadu_si128((const __m128i*)&out[i + 0]);
const __m128i b1 = _mm_loadu_si128((const __m128i*)&out[i + 4]);
#if (LINE_SIZE == 16)
const __m128i b2 = _mm_loadu_si128((const __m128i*)&out[i + 8]);
const __m128i b3 = _mm_loadu_si128((const __m128i*)&out[i + 12]);
#endif
_mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0));
_mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1));
#if (LINE_SIZE == 16)
_mm_storeu_si128((__m128i*)&out[i + 8], _mm_add_epi32(a2, b2));
_mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3));
#endif
}
for (; i < size; ++i) {
out[i] += a[i];
}
}
#undef LINE_SIZE
//------------------------------------------------------------------------------
// Entropy
// TODO(https://crbug.com/webp/499): this function produces different results
// from the C code due to use of double/float resulting in output differences
// when compared to -noasm.
#if !(defined(WEBP_HAVE_SLOW_CLZ_CTZ) || defined(__i386__) || defined(_M_IX86))
static float CombinedShannonEntropy_SSE2(const int X[256], const int Y[256]) {
int i;
double retval = 0.;
int sumX = 0, sumXY = 0;
const __m128i zero = _mm_setzero_si128();
for (i = 0; i < 256; i += 16) {
const __m128i x0 = _mm_loadu_si128((const __m128i*)(X + i + 0));
const __m128i y0 = _mm_loadu_si128((const __m128i*)(Y + i + 0));
const __m128i x1 = _mm_loadu_si128((const __m128i*)(X + i + 4));
const __m128i y1 = _mm_loadu_si128((const __m128i*)(Y + i + 4));
const __m128i x2 = _mm_loadu_si128((const __m128i*)(X + i + 8));
const __m128i y2 = _mm_loadu_si128((const __m128i*)(Y + i + 8));
const __m128i x3 = _mm_loadu_si128((const __m128i*)(X + i + 12));
const __m128i y3 = _mm_loadu_si128((const __m128i*)(Y + i + 12));
const __m128i x4 = _mm_packs_epi16(_mm_packs_epi32(x0, x1),
_mm_packs_epi32(x2, x3));
const __m128i y4 = _mm_packs_epi16(_mm_packs_epi32(y0, y1),
_mm_packs_epi32(y2, y3));
const int32_t mx = _mm_movemask_epi8(_mm_cmpgt_epi8(x4, zero));
int32_t my = _mm_movemask_epi8(_mm_cmpgt_epi8(y4, zero)) | mx;
while (my) {
const int32_t j = BitsCtz(my);
int xy;
if ((mx >> j) & 1) {
const int x = X[i + j];
        sumX += x;
        retval -= VP8LFastSLog2(x);
      }
      xy = X[i + j] + Y[i + j];
      sumXY += xy;
retval -= VP8LFastSLog2(xy);
my &= my - 1;
}
}
retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY);
return (float)retval;
}
#else
#define DONT_USE_COMBINED_SHANNON_ENTROPY_SSE2_FUNC // won't be faster
#endif
//------------------------------------------------------------------------------
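// Returns the length of the common prefix of 'array1' and 'array2', e.g.
// {1,2,3,9,...} vs {1,2,3,4,...} -> 3. The vector path compares four pixels
// per step; the scalar loop at the end handles the remaining tail.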
static int VectorMismatch_SSE2(const uint32_t* const array1,
const uint32_t* const array2, int length) {
int match_len;
if (length >= 12) {
__m128i A0 = _mm_loadu_si128((const __m128i*)&array1[0]);
__m128i A1 = _mm_loadu_si128((const __m128i*)&array2[0]);
match_len = 0;
do {
// Loop unrolling and early load both provide a speedup of 10% for the
// current function. Also, max_limit can be MAX_LENGTH=4096 at most.
const __m128i cmpA = _mm_cmpeq_epi32(A0, A1);
const __m128i B0 =
_mm_loadu_si128((const __m128i*)&array1[match_len + 4]);
const __m128i B1 =
_mm_loadu_si128((const __m128i*)&array2[match_len + 4]);
if (_mm_movemask_epi8(cmpA) != 0xffff) break;
match_len += 4;
{
const __m128i cmpB = _mm_cmpeq_epi32(B0, B1);
A0 = _mm_loadu_si128((const __m128i*)&array1[match_len + 4]);
A1 = _mm_loadu_si128((const __m128i*)&array2[match_len + 4]);
if (_mm_movemask_epi8(cmpB) != 0xffff) break;
match_len += 4;
}
} while (match_len + 12 < length);
} else {
match_len = 0;
// Unroll the potential first two loops.
if (length >= 4 &&
_mm_movemask_epi8(_mm_cmpeq_epi32(
_mm_loadu_si128((const __m128i*)&array1[0]),
_mm_loadu_si128((const __m128i*)&array2[0]))) == 0xffff) {
match_len = 4;
if (length >= 8 &&
_mm_movemask_epi8(_mm_cmpeq_epi32(
_mm_loadu_si128((const __m128i*)&array1[4]),
_mm_loadu_si128((const __m128i*)&array2[4]))) == 0xffff) {
match_len = 8;
}
}
}
while (match_len < length && array1[match_len] == array2[match_len]) {
++match_len;
}
return match_len;
}
// Bundles multiple (1, 2, 4 or 8) pixels into a single pixel.
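// For xbits == n, 1 << n consecutive palette indices are packed into the
// green-channel bits of one output pixel (e.g. xbits == 2 packs four 2-bit
// indices; see the 'abcd' comments in case 2 below).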
static void BundleColorMap_SSE2(const uint8_t* const row, int width, int xbits,
uint32_t* dst) {
int x;
assert(xbits >= 0);
assert(xbits <= 3);
switch (xbits) {
case 0: {
const __m128i ff = _mm_set1_epi16((short)0xff00);
const __m128i zero = _mm_setzero_si128();
// Store 0xff000000 | (row[x] << 8).
for (x = 0; x + 16 <= width; x += 16, dst += 16) {
const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]);
const __m128i in_lo = _mm_unpacklo_epi8(zero, in);
const __m128i dst0 = _mm_unpacklo_epi16(in_lo, ff);
const __m128i dst1 = _mm_unpackhi_epi16(in_lo, ff);
const __m128i in_hi = _mm_unpackhi_epi8(zero, in);
const __m128i dst2 = _mm_unpacklo_epi16(in_hi, ff);
const __m128i dst3 = _mm_unpackhi_epi16(in_hi, ff);
_mm_storeu_si128((__m128i*)&dst[0], dst0);
_mm_storeu_si128((__m128i*)&dst[4], dst1);
_mm_storeu_si128((__m128i*)&dst[8], dst2);
_mm_storeu_si128((__m128i*)&dst[12], dst3);
}
break;
}
case 1: {
const __m128i ff = _mm_set1_epi16((short)0xff00);
const __m128i mul = _mm_set1_epi16(0x110);
for (x = 0; x + 16 <= width; x += 16, dst += 8) {
// 0a0b | (where a/b are 4 bits).
const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]);
const __m128i tmp = _mm_mullo_epi16(in, mul); // aba0
const __m128i pack = _mm_and_si128(tmp, ff); // ab00
const __m128i dst0 = _mm_unpacklo_epi16(pack, ff);
const __m128i dst1 = _mm_unpackhi_epi16(pack, ff);
_mm_storeu_si128((__m128i*)&dst[0], dst0);
_mm_storeu_si128((__m128i*)&dst[4], dst1);
}
break;
}
case 2: {
const __m128i mask_or = _mm_set1_epi32(0xff000000);
const __m128i mul_cst = _mm_set1_epi16(0x0104);
const __m128i mask_mul = _mm_set1_epi16(0x0f00);
for (x = 0; x + 16 <= width; x += 16, dst += 4) {
// 000a000b000c000d | (where a/b/c/d are 2 bits).
const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]);
const __m128i mul = _mm_mullo_epi16(in, mul_cst); // 00ab00b000cd00d0
const __m128i tmp = _mm_and_si128(mul, mask_mul); // 00ab000000cd0000
const __m128i shift = _mm_srli_epi32(tmp, 12); // 00000000ab000000
const __m128i pack = _mm_or_si128(shift, tmp); // 00000000abcd0000
// Convert to 0xff00**00.
const __m128i res = _mm_or_si128(pack, mask_or);
_mm_storeu_si128((__m128i*)dst, res);
}
break;
}
default: {
assert(xbits == 3);
for (x = 0; x + 16 <= width; x += 16, dst += 2) {
// 0000000a00000000b... | (where a/b are 1 bit).
const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]);
const __m128i shift = _mm_slli_epi64(in, 7);
const uint32_t move = _mm_movemask_epi8(shift);
dst[0] = 0xff000000 | ((move & 0xff) << 8);
dst[1] = 0xff000000 | (move & 0xff00);
}
break;
}
}
if (x != width) {
VP8LBundleColorMap_C(row + x, width - x, xbits, dst);
}
}
//------------------------------------------------------------------------------
// Batch version of Predictor Transform subtraction
static WEBP_INLINE void Average2_m128i(const __m128i* const a0,
const __m128i* const a1,
__m128i* const avg) {
// (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)
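  // e.g. a = 3, b = 6: _mm_avg_epu8 rounds up, giving (3 + 6 + 1) >> 1 = 5,
  // and (a ^ b) & 1 == 1 removes the rounding bias: 5 - 1 == (3 + 6) >> 1.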
const __m128i ones = _mm_set1_epi8(1);
const __m128i avg1 = _mm_avg_epu8(*a0, *a1);
const __m128i one = _mm_and_si128(_mm_xor_si128(*a0, *a1), ones);
*avg = _mm_sub_epi8(avg1, one);
}
// Predictor0: ARGB_BLACK.
static void PredictorSub0_SSE2(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int i;
const __m128i black = _mm_set1_epi32(ARGB_BLACK);
for (i = 0; i + 4 <= num_pixels; i += 4) {
const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
const __m128i res = _mm_sub_epi8(src, black);
_mm_storeu_si128((__m128i*)&out[i], res);
}
if (i != num_pixels) {
VP8LPredictorsSub_C[0](in + i, NULL, num_pixels - i, out + i);
}
(void)upper;
}
#define GENERATE_PREDICTOR_1(X, IN) \
static void PredictorSub##X##_SSE2(const uint32_t* const in, \
const uint32_t* const upper, \
int num_pixels, uint32_t* const out) { \
int i; \
for (i = 0; i + 4 <= num_pixels; i += 4) { \
const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); \
const __m128i pred = _mm_loadu_si128((const __m128i*)&(IN)); \
const __m128i res = _mm_sub_epi8(src, pred); \
_mm_storeu_si128((__m128i*)&out[i], res); \
} \
if (i != num_pixels) { \
VP8LPredictorsSub_C[(X)](in + i, WEBP_OFFSET_PTR(upper, i), \
num_pixels - i, out + i); \
} \
}
GENERATE_PREDICTOR_1(1, in[i - 1]) // Predictor1: L
GENERATE_PREDICTOR_1(2, upper[i]) // Predictor2: T
GENERATE_PREDICTOR_1(3, upper[i + 1]) // Predictor3: TR
GENERATE_PREDICTOR_1(4, upper[i - 1]) // Predictor4: TL
#undef GENERATE_PREDICTOR_1
// Predictor5: avg2(avg2(L, TR), T)
static void PredictorSub5_SSE2(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int i;
for (i = 0; i + 4 <= num_pixels; i += 4) {
const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]);
const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
__m128i avg, pred, res;
Average2_m128i(&L, &TR, &avg);
Average2_m128i(&avg, &T, &pred);
res = _mm_sub_epi8(src, pred);
_mm_storeu_si128((__m128i*)&out[i], res);
}
if (i != num_pixels) {
VP8LPredictorsSub_C[5](in + i, upper + i, num_pixels - i, out + i);
}
}
#define GENERATE_PREDICTOR_2(X, A, B) \
static void PredictorSub##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
int num_pixels, uint32_t* out) { \
int i; \
for (i = 0; i + 4 <= num_pixels; i += 4) { \
const __m128i tA = _mm_loadu_si128((const __m128i*)&(A)); \
const __m128i tB = _mm_loadu_si128((const __m128i*)&(B)); \
const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); \
__m128i pred, res; \
Average2_m128i(&tA, &tB, &pred); \
res = _mm_sub_epi8(src, pred); \
_mm_storeu_si128((__m128i*)&out[i], res); \
} \
if (i != num_pixels) { \
VP8LPredictorsSub_C[(X)](in + i, upper + i, num_pixels - i, out + i); \
} \
}
GENERATE_PREDICTOR_2(6, in[i - 1], upper[i - 1]) // Predictor6: avg(L, TL)
GENERATE_PREDICTOR_2(7, in[i - 1], upper[i]) // Predictor7: avg(L, T)
GENERATE_PREDICTOR_2(8, upper[i - 1], upper[i]) // Predictor8: avg(TL, T)
GENERATE_PREDICTOR_2(9, upper[i], upper[i + 1]) // Predictor9: average(T, TR)
#undef GENERATE_PREDICTOR_2
// Predictor10: avg(avg(L,TL), avg(T, TR)).
static void PredictorSub10_SSE2(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int i;
for (i = 0; i + 4 <= num_pixels; i += 4) {
const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]);
__m128i avgTTR, avgLTL, avg, res;
Average2_m128i(&T, &TR, &avgTTR);
Average2_m128i(&L, &TL, &avgLTL);
Average2_m128i(&avgTTR, &avgLTL, &avg);
res = _mm_sub_epi8(src, avg);
_mm_storeu_si128((__m128i*)&out[i], res);
}
if (i != num_pixels) {
VP8LPredictorsSub_C[10](in + i, upper + i, num_pixels - i, out + i);
}
}
// Predictor11: select.
static void GetSumAbsDiff32_SSE2(const __m128i* const A, const __m128i* const B,
__m128i* const out) {
// We can unpack with any value on the upper 32 bits, provided it's the same
  // on both operands (so that their sum of abs diffs is zero). Here we use *A.
const __m128i A_lo = _mm_unpacklo_epi32(*A, *A);
const __m128i B_lo = _mm_unpacklo_epi32(*B, *A);
const __m128i A_hi = _mm_unpackhi_epi32(*A, *A);
const __m128i B_hi = _mm_unpackhi_epi32(*B, *A);
const __m128i s_lo = _mm_sad_epu8(A_lo, B_lo);
const __m128i s_hi = _mm_sad_epu8(A_hi, B_hi);
*out = _mm_packs_epi32(s_lo, s_hi);
}
static void PredictorSub11_SSE2(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int i;
for (i = 0; i + 4 <= num_pixels; i += 4) {
const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
__m128i pa, pb;
GetSumAbsDiff32_SSE2(&T, &TL, &pa); // pa = sum |T-TL|
GetSumAbsDiff32_SSE2(&L, &TL, &pb); // pb = sum |L-TL|
{
const __m128i mask = _mm_cmpgt_epi32(pb, pa);
const __m128i A = _mm_and_si128(mask, L);
const __m128i B = _mm_andnot_si128(mask, T);
      const __m128i pred = _mm_or_si128(A, B);  // pred = (pb > pa) ? L : T
const __m128i res = _mm_sub_epi8(src, pred);
_mm_storeu_si128((__m128i*)&out[i], res);
}
}
if (i != num_pixels) {
VP8LPredictorsSub_C[11](in + i, upper + i, num_pixels - i, out + i);
}
}
// Predictor12: ClampedSubSubtractFull.
static void PredictorSub12_SSE2(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int i;
const __m128i zero = _mm_setzero_si128();
for (i = 0; i + 4 <= num_pixels; i += 4) {
const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
const __m128i L_lo = _mm_unpacklo_epi8(L, zero);
const __m128i L_hi = _mm_unpackhi_epi8(L, zero);
const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
const __m128i T_lo = _mm_unpacklo_epi8(T, zero);
const __m128i T_hi = _mm_unpackhi_epi8(T, zero);
const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero);
const __m128i TL_hi = _mm_unpackhi_epi8(TL, zero);
const __m128i diff_lo = _mm_sub_epi16(T_lo, TL_lo);
const __m128i diff_hi = _mm_sub_epi16(T_hi, TL_hi);
const __m128i pred_lo = _mm_add_epi16(L_lo, diff_lo);
const __m128i pred_hi = _mm_add_epi16(L_hi, diff_hi);
const __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
const __m128i res = _mm_sub_epi8(src, pred);
_mm_storeu_si128((__m128i*)&out[i], res);
}
if (i != num_pixels) {
VP8LPredictorsSub_C[12](in + i, upper + i, num_pixels - i, out + i);
}
}
// Predictor13: ClampedAddSubtractHalf
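// Per channel (my reading of the code below): with a = (L + T) >> 1, this
// computes pred = clip255(a + (a - TL) / 2); the 'bit_fix' term reproduces
// C's round-toward-zero division for negative (a - TL).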
static void PredictorSub13_SSE2(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int i;
const __m128i zero = _mm_setzero_si128();
for (i = 0; i + 2 <= num_pixels; i += 2) {
// we can only process two pixels at a time
const __m128i L = _mm_loadl_epi64((const __m128i*)&in[i - 1]);
const __m128i src = _mm_loadl_epi64((const __m128i*)&in[i]);
const __m128i T = _mm_loadl_epi64((const __m128i*)&upper[i]);
const __m128i TL = _mm_loadl_epi64((const __m128i*)&upper[i - 1]);
const __m128i L_lo = _mm_unpacklo_epi8(L, zero);
const __m128i T_lo = _mm_unpacklo_epi8(T, zero);
const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero);
const __m128i sum = _mm_add_epi16(T_lo, L_lo);
const __m128i avg = _mm_srli_epi16(sum, 1);
const __m128i A1 = _mm_sub_epi16(avg, TL_lo);
const __m128i bit_fix = _mm_cmpgt_epi16(TL_lo, avg);
const __m128i A2 = _mm_sub_epi16(A1, bit_fix);
const __m128i A3 = _mm_srai_epi16(A2, 1);
const __m128i A4 = _mm_add_epi16(avg, A3);
const __m128i pred = _mm_packus_epi16(A4, A4);
const __m128i res = _mm_sub_epi8(src, pred);
_mm_storel_epi64((__m128i*)&out[i], res);
}
if (i != num_pixels) {
VP8LPredictorsSub_C[13](in + i, upper + i, num_pixels - i, out + i);
}
}
//------------------------------------------------------------------------------
// Entry point
extern void VP8LEncDspInitSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE2(void) {
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_SSE2;
VP8LTransformColor = TransformColor_SSE2;
VP8LCollectColorBlueTransforms = CollectColorBlueTransforms_SSE2;
VP8LCollectColorRedTransforms = CollectColorRedTransforms_SSE2;
VP8LAddVector = AddVector_SSE2;
VP8LAddVectorEq = AddVectorEq_SSE2;
#if !defined(DONT_USE_COMBINED_SHANNON_ENTROPY_SSE2_FUNC)
VP8LCombinedShannonEntropy = CombinedShannonEntropy_SSE2;
#endif
VP8LVectorMismatch = VectorMismatch_SSE2;
VP8LBundleColorMap = BundleColorMap_SSE2;
VP8LPredictorsSub[0] = PredictorSub0_SSE2;
VP8LPredictorsSub[1] = PredictorSub1_SSE2;
VP8LPredictorsSub[2] = PredictorSub2_SSE2;
VP8LPredictorsSub[3] = PredictorSub3_SSE2;
VP8LPredictorsSub[4] = PredictorSub4_SSE2;
VP8LPredictorsSub[5] = PredictorSub5_SSE2;
VP8LPredictorsSub[6] = PredictorSub6_SSE2;
VP8LPredictorsSub[7] = PredictorSub7_SSE2;
VP8LPredictorsSub[8] = PredictorSub8_SSE2;
VP8LPredictorsSub[9] = PredictorSub9_SSE2;
VP8LPredictorsSub[10] = PredictorSub10_SSE2;
VP8LPredictorsSub[11] = PredictorSub11_SSE2;
VP8LPredictorsSub[12] = PredictorSub12_SSE2;
VP8LPredictorsSub[13] = PredictorSub13_SSE2;
VP8LPredictorsSub[14] = PredictorSub0_SSE2; // <- padding security sentinels
VP8LPredictorsSub[15] = PredictorSub0_SSE2;
}
#else // !WEBP_USE_SSE2
WEBP_DSP_INIT_STUB(VP8LEncDspInitSSE2)
#endif // WEBP_USE_SSE2

View File

@ -0,0 +1,155 @@
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE4.1 variant of methods for lossless encoder
//
// Author: Skal (pascal.massimino@gmail.com)
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_SSE41)
#include <assert.h>
#include <smmintrin.h>
#include "src/dsp/lossless.h"
// For sign-extended multiplying constants, pre-shifted by 5:
#define CST_5b(X) (((int16_t)((uint16_t)(X) << 8)) >> 5)
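// Equivalently (my reading): CST_5b(X) == (int8_t)(X) << 3, so that for a
// value pre-shifted into the high byte,
//   _mm_mulhi_epi16(v << 8, CST_5b(c)) == ((int8_t)c * v) >> 5,
// which is the ColorTransformDelta of the lossless color transform.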
//------------------------------------------------------------------------------
// Subtract-Green Transform
static void SubtractGreenFromBlueAndRed_SSE41(uint32_t* argb_data,
int num_pixels) {
int i;
const __m128i kCstShuffle = _mm_set_epi8(-1, 13, -1, 13, -1, 9, -1, 9,
-1, 5, -1, 5, -1, 1, -1, 1);
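  // kCstShuffle replicates each pixel's green byte (byte 1 of every 4-byte
  // B,G,R,A group) into the blue and red byte positions and zeroes the rest,
  // so a single _mm_sub_epi8 subtracts green from both channels at once.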
for (i = 0; i + 4 <= num_pixels; i += 4) {
const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
const __m128i in_0g0g = _mm_shuffle_epi8(in, kCstShuffle);
const __m128i out = _mm_sub_epi8(in, in_0g0g);
_mm_storeu_si128((__m128i*)&argb_data[i], out);
}
// fallthrough and finish off with plain-C
if (i != num_pixels) {
VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i);
}
}
//------------------------------------------------------------------------------
// Color Transform
#define MK_CST_16(HI, LO) \
_mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
static void CollectColorBlueTransforms_SSE41(const uint32_t* argb, int stride,
int tile_width, int tile_height,
int green_to_blue, int red_to_blue,
int histo[]) {
const __m128i mult =
      MK_CST_16(CST_5b(red_to_blue) + 256, CST_5b(green_to_blue));
const __m128i perm =
_mm_setr_epi8(-1, 1, -1, 2, -1, 5, -1, 6, -1, 9, -1, 10, -1, 13, -1, 14);
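  // Note: the histogram increments in the inner loop below are interleaved
  // with the next iteration's shuffle/multiply, presumably software
  // pipelining to hide the latency of _mm_extract_epi8.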
if (tile_width >= 4) {
int y;
for (y = 0; y < tile_height; ++y) {
const uint32_t* const src = argb + y * stride;
const __m128i A1 = _mm_loadu_si128((const __m128i*)src);
const __m128i B1 = _mm_shuffle_epi8(A1, perm);
const __m128i C1 = _mm_mulhi_epi16(B1, mult);
const __m128i D1 = _mm_sub_epi16(A1, C1);
__m128i E = _mm_add_epi16(_mm_srli_epi32(D1, 16), D1);
int x;
for (x = 4; x + 4 <= tile_width; x += 4) {
const __m128i A2 = _mm_loadu_si128((const __m128i*)(src + x));
__m128i B2, C2, D2;
++histo[_mm_extract_epi8(E, 0)];
B2 = _mm_shuffle_epi8(A2, perm);
++histo[_mm_extract_epi8(E, 4)];
C2 = _mm_mulhi_epi16(B2, mult);
++histo[_mm_extract_epi8(E, 8)];
D2 = _mm_sub_epi16(A2, C2);
++histo[_mm_extract_epi8(E, 12)];
E = _mm_add_epi16(_mm_srli_epi32(D2, 16), D2);
}
++histo[_mm_extract_epi8(E, 0)];
++histo[_mm_extract_epi8(E, 4)];
++histo[_mm_extract_epi8(E, 8)];
++histo[_mm_extract_epi8(E, 12)];
}
}
{
const int left_over = tile_width & 3;
if (left_over > 0) {
VP8LCollectColorBlueTransforms_C(argb + tile_width - left_over, stride,
left_over, tile_height,
green_to_blue, red_to_blue, histo);
}
}
}
static void CollectColorRedTransforms_SSE41(const uint32_t* argb, int stride,
int tile_width, int tile_height,
int green_to_red, int histo[]) {
const __m128i mult = MK_CST_16(0, CST_5b(green_to_red));
const __m128i mask_g = _mm_set1_epi32(0x0000ff00);
if (tile_width >= 4) {
int y;
for (y = 0; y < tile_height; ++y) {
const uint32_t* const src = argb + y * stride;
const __m128i A1 = _mm_loadu_si128((const __m128i*)src);
const __m128i B1 = _mm_and_si128(A1, mask_g);
const __m128i C1 = _mm_madd_epi16(B1, mult);
__m128i D = _mm_sub_epi16(A1, C1);
int x;
for (x = 4; x + 4 <= tile_width; x += 4) {
const __m128i A2 = _mm_loadu_si128((const __m128i*)(src + x));
__m128i B2, C2;
++histo[_mm_extract_epi8(D, 2)];
B2 = _mm_and_si128(A2, mask_g);
++histo[_mm_extract_epi8(D, 6)];
C2 = _mm_madd_epi16(B2, mult);
++histo[_mm_extract_epi8(D, 10)];
++histo[_mm_extract_epi8(D, 14)];
D = _mm_sub_epi16(A2, C2);
}
++histo[_mm_extract_epi8(D, 2)];
++histo[_mm_extract_epi8(D, 6)];
++histo[_mm_extract_epi8(D, 10)];
++histo[_mm_extract_epi8(D, 14)];
}
}
{
const int left_over = tile_width & 3;
if (left_over > 0) {
VP8LCollectColorRedTransforms_C(argb + tile_width - left_over, stride,
left_over, tile_height, green_to_red,
histo);
}
}
}
#undef MK_CST_16
//------------------------------------------------------------------------------
// Entry point
extern void VP8LEncDspInitSSE41(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE41(void) {
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_SSE41;
VP8LCollectColorBlueTransforms = CollectColorBlueTransforms_SSE41;
VP8LCollectColorRedTransforms = CollectColorRedTransforms_SSE41;
}
#else // !WEBP_USE_SSE41
WEBP_DSP_INIT_STUB(VP8LEncDspInitSSE41)
#endif // WEBP_USE_SSE41

View File

@ -9,11 +9,15 @@ with Files('**'):
SOURCES += [
'alpha_processing.c',
'cost.c',
'dec.c',
'dec_clip_tables.c',
'enc.c',
'filters.c',
'lossless.c',
'lossless_enc.c',
'rescaler.c',
'ssim.c',
'upsampling.c',
'yuv.c',
]
@ -25,8 +29,11 @@ LOCAL_INCLUDES += [
if CONFIG['CPU_ARCH'] == 'arm' and CONFIG['BUILD_ARM_NEON']:
SOURCES += [
'alpha_processing_neon.c',
'cost_neon.c',
'dec_neon.c',
'enc_neon.c',
'filters_neon.c',
'lossless_enc_neon.c',
'lossless_neon.c',
'rescaler_neon.c',
'upsampling_neon.c',
@ -39,8 +46,11 @@ if CONFIG['CPU_ARCH'] == 'arm' and CONFIG['BUILD_ARM_NEON']:
elif CONFIG['CPU_ARCH'] == 'aarch64':
SOURCES += [
'alpha_processing_neon.c',
'cost_neon.c',
'dec_neon.c',
'enc_neon.c',
'filters_neon.c',
'lossless_enc_neon.c',
'lossless_neon.c',
'rescaler_neon.c',
'upsampling_neon.c',
@ -51,12 +61,18 @@ elif CONFIG['INTEL_ARCHITECTURE']:
SOURCES += [
'alpha_processing_sse2.c',
'alpha_processing_sse41.c',
'cost_sse2.c',
'dec_sse2.c',
'dec_sse41.c',
'enc_sse2.c',
'enc_sse41.c',
'filters_sse2.c',
'lossless_enc_sse2.c',
'lossless_enc_sse41.c',
'lossless_sse2.c',
'lossless_sse41.c',
'rescaler_sse2.c',
'ssim_sse2.c',
'upsampling_sse2.c',
'upsampling_sse41.c',
'yuv_sse2.c',
@ -72,9 +88,15 @@ elif CONFIG['INTEL_ARCHITECTURE']:
elif CONFIG['CPU_ARCH'].startswith('mips'):
SOURCES += [
'alpha_processing_mips_dsp_r2.c',
'cost_mips32.c',
'cost_mips_dsp_r2.c',
'dec_mips32.c',
'dec_mips_dsp_r2.c',
'enc_mips32.c',
'enc_mips_dsp_r2.c',
'filters_mips_dsp_r2.c',
'lossless_enc_mips32.c',
'lossless_enc_mips_dsp_r2.c',
'lossless_mips_dsp_r2.c',
'lossless_msa.c',
'rescaler_mips32.c',

View File

@ -0,0 +1,159 @@
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// distortion calculation
//
// Author: Skal (pascal.massimino@gmail.com)
#include <assert.h>
#include <stdlib.h> // for abs()
#include "src/dsp/dsp.h"
#if !defined(WEBP_REDUCE_SIZE)
//------------------------------------------------------------------------------
// SSIM / PSNR
// hat-shaped filter. Sum of coefficients is equal to 16.
static const uint32_t kWeight[2 * VP8_SSIM_KERNEL + 1] = {
1, 2, 3, 4, 3, 2, 1
};
static const uint32_t kWeightSum = 16 * 16; // sum{kWeight}^2
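// SSIMCalculation evaluates the standard SSIM ratio
//   ((2*xm*ym + C1) * (2*sxy + C2)) / ((xm^2 + ym^2 + C1) * (sxx + syy + C2))
// in integer arithmetic, with means and moments pre-scaled by the number of
// samples N.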
static WEBP_INLINE double SSIMCalculation(
const VP8DistoStats* const stats, uint32_t N /*num samples*/) {
const uint32_t w2 = N * N;
const uint32_t C1 = 20 * w2;
const uint32_t C2 = 60 * w2;
const uint32_t C3 = 8 * 8 * w2; // 'dark' limit ~= 6
const uint64_t xmxm = (uint64_t)stats->xm * stats->xm;
const uint64_t ymym = (uint64_t)stats->ym * stats->ym;
if (xmxm + ymym >= C3) {
const int64_t xmym = (int64_t)stats->xm * stats->ym;
const int64_t sxy = (int64_t)stats->xym * N - xmym; // can be negative
const uint64_t sxx = (uint64_t)stats->xxm * N - xmxm;
const uint64_t syy = (uint64_t)stats->yym * N - ymym;
// we descale by 8 to prevent overflow during the fnum/fden multiply.
const uint64_t num_S = (2 * (uint64_t)(sxy < 0 ? 0 : sxy) + C2) >> 8;
const uint64_t den_S = (sxx + syy + C2) >> 8;
const uint64_t fnum = (2 * xmym + C1) * num_S;
const uint64_t fden = (xmxm + ymym + C1) * den_S;
const double r = (double)fnum / fden;
assert(r >= 0. && r <= 1.0);
return r;
}
return 1.; // area is too dark to contribute meaningfully
}
double VP8SSIMFromStats(const VP8DistoStats* const stats) {
return SSIMCalculation(stats, kWeightSum);
}
double VP8SSIMFromStatsClipped(const VP8DistoStats* const stats) {
return SSIMCalculation(stats, stats->w);
}
static double SSIMGetClipped_C(const uint8_t* src1, int stride1,
const uint8_t* src2, int stride2,
int xo, int yo, int W, int H) {
VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 };
const int ymin = (yo - VP8_SSIM_KERNEL < 0) ? 0 : yo - VP8_SSIM_KERNEL;
const int ymax = (yo + VP8_SSIM_KERNEL > H - 1) ? H - 1
: yo + VP8_SSIM_KERNEL;
const int xmin = (xo - VP8_SSIM_KERNEL < 0) ? 0 : xo - VP8_SSIM_KERNEL;
const int xmax = (xo + VP8_SSIM_KERNEL > W - 1) ? W - 1
: xo + VP8_SSIM_KERNEL;
int x, y;
src1 += ymin * stride1;
src2 += ymin * stride2;
for (y = ymin; y <= ymax; ++y, src1 += stride1, src2 += stride2) {
for (x = xmin; x <= xmax; ++x) {
const uint32_t w = kWeight[VP8_SSIM_KERNEL + x - xo]
* kWeight[VP8_SSIM_KERNEL + y - yo];
const uint32_t s1 = src1[x];
const uint32_t s2 = src2[x];
stats.w += w;
stats.xm += w * s1;
stats.ym += w * s2;
stats.xxm += w * s1 * s1;
stats.xym += w * s1 * s2;
stats.yym += w * s2 * s2;
}
}
return VP8SSIMFromStatsClipped(&stats);
}
static double SSIMGet_C(const uint8_t* src1, int stride1,
const uint8_t* src2, int stride2) {
VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 };
int x, y;
for (y = 0; y <= 2 * VP8_SSIM_KERNEL; ++y, src1 += stride1, src2 += stride2) {
for (x = 0; x <= 2 * VP8_SSIM_KERNEL; ++x) {
const uint32_t w = kWeight[x] * kWeight[y];
const uint32_t s1 = src1[x];
const uint32_t s2 = src2[x];
stats.xm += w * s1;
stats.ym += w * s2;
stats.xxm += w * s1 * s1;
stats.xym += w * s1 * s2;
stats.yym += w * s2 * s2;
}
}
return VP8SSIMFromStats(&stats);
}
#endif // !defined(WEBP_REDUCE_SIZE)
//------------------------------------------------------------------------------
#if !defined(WEBP_DISABLE_STATS)
static uint32_t AccumulateSSE_C(const uint8_t* src1,
const uint8_t* src2, int len) {
int i;
uint32_t sse2 = 0;
assert(len <= 65535); // to ensure that accumulation fits within uint32_t
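  // (255^2 * 65535 == 4261413375 < 2^32, so 'sse2' cannot wrap.)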
for (i = 0; i < len; ++i) {
const int32_t diff = src1[i] - src2[i];
sse2 += diff * diff;
}
return sse2;
}
#endif
//------------------------------------------------------------------------------
#if !defined(WEBP_REDUCE_SIZE)
VP8SSIMGetFunc VP8SSIMGet;
VP8SSIMGetClippedFunc VP8SSIMGetClipped;
#endif
#if !defined(WEBP_DISABLE_STATS)
VP8AccumulateSSEFunc VP8AccumulateSSE;
#endif
extern void VP8SSIMDspInitSSE2(void);
WEBP_DSP_INIT_FUNC(VP8SSIMDspInit) {
#if !defined(WEBP_REDUCE_SIZE)
VP8SSIMGetClipped = SSIMGetClipped_C;
VP8SSIMGet = SSIMGet_C;
#endif
#if !defined(WEBP_DISABLE_STATS)
VP8AccumulateSSE = AccumulateSSE_C;
#endif
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_HAVE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
VP8SSIMDspInitSSE2();
}
#endif
}
}

View File

@ -0,0 +1,165 @@
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE2 version of distortion calculation
//
// Author: Skal (pascal.massimino@gmail.com)
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_SSE2)
#include <assert.h>
#include <emmintrin.h>
#include "src/dsp/common_sse2.h"
#if !defined(WEBP_DISABLE_STATS)
// Helper function
static WEBP_INLINE void SubtractAndSquare_SSE2(const __m128i a, const __m128i b,
__m128i* const sum) {
// take abs(a-b) in 8b
const __m128i a_b = _mm_subs_epu8(a, b);
const __m128i b_a = _mm_subs_epu8(b, a);
const __m128i abs_a_b = _mm_or_si128(a_b, b_a);
// zero-extend to 16b
const __m128i zero = _mm_setzero_si128();
const __m128i C0 = _mm_unpacklo_epi8(abs_a_b, zero);
const __m128i C1 = _mm_unpackhi_epi8(abs_a_b, zero);
// multiply with self
const __m128i sum1 = _mm_madd_epi16(C0, C0);
const __m128i sum2 = _mm_madd_epi16(C1, C1);
*sum = _mm_add_epi32(sum1, sum2);
}
//------------------------------------------------------------------------------
// SSIM / PSNR entry point
static uint32_t AccumulateSSE_SSE2(const uint8_t* src1,
const uint8_t* src2, int len) {
int i = 0;
uint32_t sse2 = 0;
if (len >= 16) {
const int limit = len - 32;
int32_t tmp[4];
__m128i sum1;
__m128i sum = _mm_setzero_si128();
__m128i a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
__m128i b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
i += 16;
while (i <= limit) {
const __m128i a1 = _mm_loadu_si128((const __m128i*)&src1[i]);
const __m128i b1 = _mm_loadu_si128((const __m128i*)&src2[i]);
__m128i sum2;
i += 16;
SubtractAndSquare_SSE2(a0, b0, &sum1);
sum = _mm_add_epi32(sum, sum1);
a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
i += 16;
SubtractAndSquare_SSE2(a1, b1, &sum2);
sum = _mm_add_epi32(sum, sum2);
}
SubtractAndSquare_SSE2(a0, b0, &sum1);
sum = _mm_add_epi32(sum, sum1);
_mm_storeu_si128((__m128i*)tmp, sum);
sse2 += (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
}
for (; i < len; ++i) {
const int32_t diff = src1[i] - src2[i];
sse2 += diff * diff;
}
return sse2;
}
#endif // !defined(WEBP_DISABLE_STATS)
#if !defined(WEBP_REDUCE_SIZE)
static uint32_t HorizontalAdd16b_SSE2(const __m128i* const m) {
uint16_t tmp[8];
const __m128i a = _mm_srli_si128(*m, 8);
const __m128i b = _mm_add_epi16(*m, a);
_mm_storeu_si128((__m128i*)tmp, b);
return (uint32_t)tmp[3] + tmp[2] + tmp[1] + tmp[0];
}
static uint32_t HorizontalAdd32b_SSE2(const __m128i* const m) {
const __m128i a = _mm_srli_si128(*m, 8);
const __m128i b = _mm_add_epi32(*m, a);
const __m128i c = _mm_add_epi32(b, _mm_srli_si128(b, 4));
return (uint32_t)_mm_cvtsi128_si32(c);
}
static const uint16_t kWeight[] = { 1, 2, 3, 4, 3, 2, 1, 0 };
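// The trailing 0 pads the 7-tap kernel to eight 16-bit lanes, so the 8th
// byte loaded by ACCUMULATE_ROW below contributes nothing.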
#define ACCUMULATE_ROW(WEIGHT) do { \
/* compute row weight (Wx * Wy) */ \
const __m128i Wy = _mm_set1_epi16((WEIGHT)); \
const __m128i W = _mm_mullo_epi16(Wx, Wy); \
/* process 8 bytes at a time (7 bytes, actually) */ \
const __m128i a0 = _mm_loadl_epi64((const __m128i*)src1); \
const __m128i b0 = _mm_loadl_epi64((const __m128i*)src2); \
/* convert to 16b and multiply by weight */ \
const __m128i a1 = _mm_unpacklo_epi8(a0, zero); \
const __m128i b1 = _mm_unpacklo_epi8(b0, zero); \
const __m128i wa1 = _mm_mullo_epi16(a1, W); \
const __m128i wb1 = _mm_mullo_epi16(b1, W); \
/* accumulate */ \
xm = _mm_add_epi16(xm, wa1); \
ym = _mm_add_epi16(ym, wb1); \
xxm = _mm_add_epi32(xxm, _mm_madd_epi16(a1, wa1)); \
xym = _mm_add_epi32(xym, _mm_madd_epi16(a1, wb1)); \
yym = _mm_add_epi32(yym, _mm_madd_epi16(b1, wb1)); \
src1 += stride1; \
src2 += stride2; \
} while (0)
static double SSIMGet_SSE2(const uint8_t* src1, int stride1,
const uint8_t* src2, int stride2) {
VP8DistoStats stats;
const __m128i zero = _mm_setzero_si128();
__m128i xm = zero, ym = zero; // 16b accums
__m128i xxm = zero, yym = zero, xym = zero; // 32b accum
const __m128i Wx = _mm_loadu_si128((const __m128i*)kWeight);
assert(2 * VP8_SSIM_KERNEL + 1 == 7);
ACCUMULATE_ROW(1);
ACCUMULATE_ROW(2);
ACCUMULATE_ROW(3);
ACCUMULATE_ROW(4);
ACCUMULATE_ROW(3);
ACCUMULATE_ROW(2);
ACCUMULATE_ROW(1);
stats.xm = HorizontalAdd16b_SSE2(&xm);
stats.ym = HorizontalAdd16b_SSE2(&ym);
stats.xxm = HorizontalAdd32b_SSE2(&xxm);
stats.xym = HorizontalAdd32b_SSE2(&xym);
stats.yym = HorizontalAdd32b_SSE2(&yym);
return VP8SSIMFromStats(&stats);
}
#endif // !defined(WEBP_REDUCE_SIZE)
extern void VP8SSIMDspInitSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInitSSE2(void) {
#if !defined(WEBP_DISABLE_STATS)
VP8AccumulateSSE = AccumulateSSE_SSE2;
#endif
#if !defined(WEBP_REDUCE_SIZE)
VP8SSIMGet = SSIMGet_SSE2;
#endif
}
#else // !WEBP_USE_SSE2
WEBP_DSP_INIT_STUB(VP8SSIMDspInitSSE2)
#endif // WEBP_USE_SSE2

View File

@ -0,0 +1,443 @@
// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Alpha-plane compression.
//
// Author: Skal (pascal.massimino@gmail.com)
#include <assert.h>
#include <stdlib.h>
#include "src/enc/vp8i_enc.h"
#include "src/dsp/dsp.h"
#include "src/utils/filters_utils.h"
#include "src/utils/quant_levels_utils.h"
#include "src/utils/utils.h"
#include "src/webp/format_constants.h"
// -----------------------------------------------------------------------------
// Encodes the given alpha data via specified compression method 'method'.
// The pre-processing (quantization) is performed if 'quality' is less than 100.
// For such cases, the encoding is lossy. The valid range is [0, 100] for
// 'quality' and [0, 1] for 'method':
// 'method = 0' - No compression;
// 'method = 1' - Use lossless coder on the alpha plane only
// 'filter' values [0, 4] correspond to prediction modes none, horizontal,
// vertical & gradient filters. The prediction mode 4 will try all the
// prediction modes 0 to 3 and pick the best one.
// 'effort_level': specifies how much effort must be spent to try and reduce
// the compressed output size. In range 0 (quick) to 6 (slow).
//
// 'output' corresponds to the buffer containing compressed alpha data.
// This buffer is allocated by this method and the caller should call
// WebPSafeFree(*output) when done.
// 'output_size' corresponds to size of this compressed alpha buffer.
//
// Returns 1 on successfully encoding the alpha and
// 0 if either:
// invalid quality or method, or
// memory allocation for the compressed data fails.
#include "src/enc/vp8li_enc.h"
static int EncodeLossless(const uint8_t* const data, int width, int height,
int effort_level, // in [0..6] range
int use_quality_100, VP8LBitWriter* const bw,
WebPAuxStats* const stats) {
int ok = 0;
WebPConfig config;
WebPPicture picture;
WebPPictureInit(&picture);
picture.width = width;
picture.height = height;
picture.use_argb = 1;
picture.stats = stats;
if (!WebPPictureAlloc(&picture)) return 0;
// Transfer the alpha values to the green channel.
WebPDispatchAlphaToGreen(data, width, picture.width, picture.height,
picture.argb, picture.argb_stride);
WebPConfigInit(&config);
config.lossless = 1;
// Enable exact, or it would alter RGB values of transparent alpha, which is
// normally OK but not here since we are not encoding the input image but an
// internal encoding-related image containing necessary exact information in
// RGB channels.
config.exact = 1;
config.method = effort_level; // impact is very small
// Set a low default quality for encoding alpha. Ensure that Alpha quality at
// lower methods (3 and below) is less than the threshold for triggering
// costly 'BackwardReferencesTraceBackwards'.
// If the alpha quality is set to 100 and the method to 6, allow for a high
// lossless quality to trigger the cruncher.
config.quality =
(use_quality_100 && effort_level == 6) ? 100 : 8.f * effort_level;
assert(config.quality >= 0 && config.quality <= 100.f);
// TODO(urvang): Temporary fix to avoid generating images that trigger
// a decoder bug related to alpha with color cache.
// See: https://code.google.com/p/webp/issues/detail?id=239
// Need to re-enable this later.
ok = (VP8LEncodeStream(&config, &picture, bw, 0 /*use_cache*/) == VP8_ENC_OK);
WebPPictureFree(&picture);
ok = ok && !bw->error_;
if (!ok) {
VP8LBitWriterWipeOut(bw);
return 0;
}
return 1;
}
// -----------------------------------------------------------------------------
// Small struct to hold the result of a filter mode compression attempt.
typedef struct {
size_t score;
VP8BitWriter bw;
WebPAuxStats stats;
} FilterTrial;
// This function always returns an initialized 'bw' object, even upon error.
static int EncodeAlphaInternal(const uint8_t* const data, int width, int height,
int method, int filter, int reduce_levels,
int effort_level, // in [0..6] range
uint8_t* const tmp_alpha,
FilterTrial* result) {
int ok = 0;
const uint8_t* alpha_src;
WebPFilterFunc filter_func;
uint8_t header;
const size_t data_size = width * height;
const uint8_t* output = NULL;
size_t output_size = 0;
VP8LBitWriter tmp_bw;
assert((uint64_t)data_size == (uint64_t)width * height); // as per spec
assert(filter >= 0 && filter < WEBP_FILTER_LAST);
assert(method >= ALPHA_NO_COMPRESSION);
assert(method <= ALPHA_LOSSLESS_COMPRESSION);
assert(sizeof(header) == ALPHA_HEADER_LEN);
filter_func = WebPFilters[filter];
if (filter_func != NULL) {
filter_func(data, width, height, width, tmp_alpha);
alpha_src = tmp_alpha;
} else {
alpha_src = data;
}
if (method != ALPHA_NO_COMPRESSION) {
ok = VP8LBitWriterInit(&tmp_bw, data_size >> 3);
ok = ok && EncodeLossless(alpha_src, width, height, effort_level,
!reduce_levels, &tmp_bw, &result->stats);
if (ok) {
output = VP8LBitWriterFinish(&tmp_bw);
output_size = VP8LBitWriterNumBytes(&tmp_bw);
if (output_size > data_size) {
// compressed size is larger than source! Revert to uncompressed mode.
method = ALPHA_NO_COMPRESSION;
VP8LBitWriterWipeOut(&tmp_bw);
}
} else {
VP8LBitWriterWipeOut(&tmp_bw);
return 0;
}
}
if (method == ALPHA_NO_COMPRESSION) {
output = alpha_src;
output_size = data_size;
ok = 1;
}
// Emit final result.
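  // Layout: bits 0-1 = compression method, bits 2-3 = filter,
  // bits 4-5 = pre-processing; bits 6-7 stay zero.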
header = method | (filter << 2);
if (reduce_levels) header |= ALPHA_PREPROCESSED_LEVELS << 4;
VP8BitWriterInit(&result->bw, ALPHA_HEADER_LEN + output_size);
ok = ok && VP8BitWriterAppend(&result->bw, &header, ALPHA_HEADER_LEN);
ok = ok && VP8BitWriterAppend(&result->bw, output, output_size);
if (method != ALPHA_NO_COMPRESSION) {
VP8LBitWriterWipeOut(&tmp_bw);
}
ok = ok && !result->bw.error_;
result->score = VP8BitWriterSize(&result->bw);
return ok;
}
// -----------------------------------------------------------------------------
static int GetNumColors(const uint8_t* data, int width, int height,
int stride) {
int j;
int colors = 0;
uint8_t color[256] = { 0 };
for (j = 0; j < height; ++j) {
int i;
const uint8_t* const p = data + j * stride;
for (i = 0; i < width; ++i) {
color[p[i]] = 1;
}
}
for (j = 0; j < 256; ++j) {
if (color[j] > 0) ++colors;
}
return colors;
}
#define FILTER_TRY_NONE (1 << WEBP_FILTER_NONE)
#define FILTER_TRY_ALL ((1 << WEBP_FILTER_LAST) - 1)
// Given the input 'filter' option, return an OR'd bit-set of filters to try.
static uint32_t GetFilterMap(const uint8_t* alpha, int width, int height,
int filter, int effort_level) {
uint32_t bit_map = 0U;
if (filter == WEBP_FILTER_FAST) {
// Quick estimate of the best candidate.
int try_filter_none = (effort_level > 3);
const int kMinColorsForFilterNone = 16;
const int kMaxColorsForFilterNone = 192;
const int num_colors = GetNumColors(alpha, width, height, width);
// For low number of colors, NONE yields better compression.
filter = (num_colors <= kMinColorsForFilterNone)
? WEBP_FILTER_NONE
: WebPEstimateBestFilter(alpha, width, height, width);
bit_map |= 1 << filter;
    // For a large number of colors, try FILTER_NONE in addition to the
    // best filter.
if (try_filter_none || num_colors > kMaxColorsForFilterNone) {
bit_map |= FILTER_TRY_NONE;
}
} else if (filter == WEBP_FILTER_NONE) {
bit_map = FILTER_TRY_NONE;
} else { // WEBP_FILTER_BEST -> try all
bit_map = FILTER_TRY_ALL;
}
return bit_map;
}
static void InitFilterTrial(FilterTrial* const score) {
score->score = (size_t)~0U;
VP8BitWriterInit(&score->bw, 0);
}
static int ApplyFiltersAndEncode(const uint8_t* alpha, int width, int height,
size_t data_size, int method, int filter,
int reduce_levels, int effort_level,
uint8_t** const output,
size_t* const output_size,
WebPAuxStats* const stats) {
int ok = 1;
FilterTrial best;
uint32_t try_map =
GetFilterMap(alpha, width, height, filter, effort_level);
InitFilterTrial(&best);
if (try_map != FILTER_TRY_NONE) {
uint8_t* filtered_alpha = (uint8_t*)WebPSafeMalloc(1ULL, data_size);
if (filtered_alpha == NULL) return 0;
for (filter = WEBP_FILTER_NONE; ok && try_map; ++filter, try_map >>= 1) {
if (try_map & 1) {
FilterTrial trial;
ok = EncodeAlphaInternal(alpha, width, height, method, filter,
reduce_levels, effort_level, filtered_alpha,
&trial);
if (ok && trial.score < best.score) {
VP8BitWriterWipeOut(&best.bw);
best = trial;
} else {
VP8BitWriterWipeOut(&trial.bw);
}
}
}
WebPSafeFree(filtered_alpha);
} else {
ok = EncodeAlphaInternal(alpha, width, height, method, WEBP_FILTER_NONE,
reduce_levels, effort_level, NULL, &best);
}
if (ok) {
#if !defined(WEBP_DISABLE_STATS)
if (stats != NULL) {
stats->lossless_features = best.stats.lossless_features;
stats->histogram_bits = best.stats.histogram_bits;
stats->transform_bits = best.stats.transform_bits;
stats->cache_bits = best.stats.cache_bits;
stats->palette_size = best.stats.palette_size;
stats->lossless_size = best.stats.lossless_size;
stats->lossless_hdr_size = best.stats.lossless_hdr_size;
stats->lossless_data_size = best.stats.lossless_data_size;
}
#else
(void)stats;
#endif
*output_size = VP8BitWriterSize(&best.bw);
*output = VP8BitWriterBuf(&best.bw);
} else {
VP8BitWriterWipeOut(&best.bw);
}
return ok;
}
static int EncodeAlpha(VP8Encoder* const enc,
int quality, int method, int filter,
int effort_level,
uint8_t** const output, size_t* const output_size) {
const WebPPicture* const pic = enc->pic_;
const int width = pic->width;
const int height = pic->height;
uint8_t* quant_alpha = NULL;
const size_t data_size = width * height;
uint64_t sse = 0;
int ok = 1;
const int reduce_levels = (quality < 100);
// quick correctness checks
assert((uint64_t)data_size == (uint64_t)width * height); // as per spec
assert(enc != NULL && pic != NULL && pic->a != NULL);
assert(output != NULL && output_size != NULL);
assert(width > 0 && height > 0);
assert(pic->a_stride >= width);
assert(filter >= WEBP_FILTER_NONE && filter <= WEBP_FILTER_FAST);
if (quality < 0 || quality > 100) {
return 0;
}
if (method < ALPHA_NO_COMPRESSION || method > ALPHA_LOSSLESS_COMPRESSION) {
return 0;
}
if (method == ALPHA_NO_COMPRESSION) {
// Don't filter, as filtering will make no impact on compressed size.
filter = WEBP_FILTER_NONE;
}
quant_alpha = (uint8_t*)WebPSafeMalloc(1ULL, data_size);
if (quant_alpha == NULL) {
return 0;
}
// Extract alpha data (width x height) from raw_data (stride x height).
WebPCopyPlane(pic->a, pic->a_stride, quant_alpha, width, width, height);
if (reduce_levels) { // No Quantization required for 'quality = 100'.
    // 16 alpha levels give quite a low MSE w.r.t. the original alpha plane,
    // so they are mapped to a moderate quality of 70. Hence
    // Quality:[0, 70] -> Levels:[2, 16] and Quality:]70, 100] -> Levels:]16, 256].
const int alpha_levels = (quality <= 70) ? (2 + quality / 5)
: (16 + (quality - 70) * 8);
ok = QuantizeLevels(quant_alpha, width, height, alpha_levels, &sse);
}
if (ok) {
VP8FiltersInit();
ok = ApplyFiltersAndEncode(quant_alpha, width, height, data_size, method,
filter, reduce_levels, effort_level, output,
output_size, pic->stats);
#if !defined(WEBP_DISABLE_STATS)
if (pic->stats != NULL) { // need stats?
pic->stats->coded_size += (int)(*output_size);
enc->sse_[3] = sse;
}
#endif
}
WebPSafeFree(quant_alpha);
return ok;
}
//------------------------------------------------------------------------------
// Main calls
static int CompressAlphaJob(void* arg1, void* unused) {
VP8Encoder* const enc = (VP8Encoder*)arg1;
const WebPConfig* config = enc->config_;
uint8_t* alpha_data = NULL;
size_t alpha_size = 0;
const int effort_level = config->method; // maps to [0..6]
const WEBP_FILTER_TYPE filter =
(config->alpha_filtering == 0) ? WEBP_FILTER_NONE :
(config->alpha_filtering == 1) ? WEBP_FILTER_FAST :
WEBP_FILTER_BEST;
if (!EncodeAlpha(enc, config->alpha_quality, config->alpha_compression,
filter, effort_level, &alpha_data, &alpha_size)) {
return 0;
}
if (alpha_size != (uint32_t)alpha_size) { // Soundness check.
WebPSafeFree(alpha_data);
return 0;
}
enc->alpha_data_size_ = (uint32_t)alpha_size;
enc->alpha_data_ = alpha_data;
(void)unused;
return 1;
}
void VP8EncInitAlpha(VP8Encoder* const enc) {
WebPInitAlphaProcessing();
enc->has_alpha_ = WebPPictureHasTransparency(enc->pic_);
enc->alpha_data_ = NULL;
enc->alpha_data_size_ = 0;
if (enc->thread_level_ > 0) {
WebPWorker* const worker = &enc->alpha_worker_;
WebPGetWorkerInterface()->Init(worker);
worker->data1 = enc;
worker->data2 = NULL;
worker->hook = CompressAlphaJob;
}
}
int VP8EncStartAlpha(VP8Encoder* const enc) {
if (enc->has_alpha_) {
if (enc->thread_level_ > 0) {
WebPWorker* const worker = &enc->alpha_worker_;
// Makes sure worker is good to go.
if (!WebPGetWorkerInterface()->Reset(worker)) {
return 0;
}
WebPGetWorkerInterface()->Launch(worker);
return 1;
} else {
return CompressAlphaJob(enc, NULL); // just do the job right away
}
}
return 1;
}
int VP8EncFinishAlpha(VP8Encoder* const enc) {
if (enc->has_alpha_) {
if (enc->thread_level_ > 0) {
WebPWorker* const worker = &enc->alpha_worker_;
if (!WebPGetWorkerInterface()->Sync(worker)) return 0; // error
}
}
return WebPReportProgress(enc->pic_, enc->percent_ + 20, &enc->percent_);
}
int VP8EncDeleteAlpha(VP8Encoder* const enc) {
int ok = 1;
if (enc->thread_level_ > 0) {
WebPWorker* const worker = &enc->alpha_worker_;
// finish anything left in flight
ok = WebPGetWorkerInterface()->Sync(worker);
// still need to end the worker, even if !ok
WebPGetWorkerInterface()->End(worker);
}
WebPSafeFree(enc->alpha_data_);
enc->alpha_data_ = NULL;
enc->alpha_data_size_ = 0;
enc->has_alpha_ = 0;
return ok;
}

View File

@ -0,0 +1,475 @@
// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Macroblock analysis
//
// Author: Skal (pascal.massimino@gmail.com)
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include "src/enc/vp8i_enc.h"
#include "src/enc/cost_enc.h"
#include "src/utils/utils.h"
#define MAX_ITERS_K_MEANS 6
//------------------------------------------------------------------------------
// Smooth the segment map by replacing isolated blocks with the majority
// segment of their neighbours.
static void SmoothSegmentMap(VP8Encoder* const enc) {
int n, x, y;
const int w = enc->mb_w_;
const int h = enc->mb_h_;
const int majority_cnt_3_x_3_grid = 5;
uint8_t* const tmp = (uint8_t*)WebPSafeMalloc(w * h, sizeof(*tmp));
assert((uint64_t)(w * h) == (uint64_t)w * h); // no overflow, as per spec
if (tmp == NULL) return;
for (y = 1; y < h - 1; ++y) {
for (x = 1; x < w - 1; ++x) {
int cnt[NUM_MB_SEGMENTS] = { 0 };
const VP8MBInfo* const mb = &enc->mb_info_[x + w * y];
int majority_seg = mb->segment_;
// Check the 8 neighbouring segment values.
cnt[mb[-w - 1].segment_]++; // top-left
cnt[mb[-w + 0].segment_]++; // top
cnt[mb[-w + 1].segment_]++; // top-right
cnt[mb[ - 1].segment_]++; // left
cnt[mb[ + 1].segment_]++; // right
cnt[mb[ w - 1].segment_]++; // bottom-left
cnt[mb[ w + 0].segment_]++; // bottom
cnt[mb[ w + 1].segment_]++; // bottom-right
for (n = 0; n < NUM_MB_SEGMENTS; ++n) {
if (cnt[n] >= majority_cnt_3_x_3_grid) {
majority_seg = n;
break;
}
}
tmp[x + y * w] = majority_seg;
}
}
for (y = 1; y < h - 1; ++y) {
for (x = 1; x < w - 1; ++x) {
VP8MBInfo* const mb = &enc->mb_info_[x + w * y];
mb->segment_ = tmp[x + y * w];
}
}
WebPSafeFree(tmp);
}
//------------------------------------------------------------------------------
// set segment susceptibility alpha_ / beta_
static WEBP_INLINE int clip(int v, int m, int M) {
return (v < m) ? m : (v > M) ? M : v;
}
static void SetSegmentAlphas(VP8Encoder* const enc,
const int centers[NUM_MB_SEGMENTS],
int mid) {
const int nb = enc->segment_hdr_.num_segments_;
int min = centers[0], max = centers[0];
int n;
if (nb > 1) {
for (n = 0; n < nb; ++n) {
if (min > centers[n]) min = centers[n];
if (max < centers[n]) max = centers[n];
}
}
if (max == min) max = min + 1;
assert(mid <= max && mid >= min);
for (n = 0; n < nb; ++n) {
const int alpha = 255 * (centers[n] - mid) / (max - min);
const int beta = 255 * (centers[n] - min) / (max - min);
enc->dqm_[n].alpha_ = clip(alpha, -127, 127);
enc->dqm_[n].beta_ = clip(beta, 0, 255);
}
}
//------------------------------------------------------------------------------
// Compute susceptibility based on DCT-coeff histograms:
// the higher, the "easier" the macroblock is to compress.
#define MAX_ALPHA 255 // 8b of precision for susceptibilities.
#define ALPHA_SCALE (2 * MAX_ALPHA) // scaling factor for alpha.
#define DEFAULT_ALPHA (-1)
#define IS_BETTER_ALPHA(alpha, best_alpha) ((alpha) > (best_alpha))
static int FinalAlphaValue(int alpha) {
alpha = MAX_ALPHA - alpha;
return clip(alpha, 0, MAX_ALPHA);
}
static int GetAlpha(const VP8Histogram* const histo) {
// 'alpha' will later be clipped to [0..MAX_ALPHA] range, clamping outer
// values which happen to be mostly noise. This leaves the maximum precision
// for handling the useful small values which contribute most.
const int max_value = histo->max_value;
const int last_non_zero = histo->last_non_zero;
const int alpha =
(max_value > 1) ? ALPHA_SCALE * last_non_zero / max_value : 0;
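  // e.g. max_value == 64, last_non_zero == 8 -> alpha == 510 * 8 / 64 == 63.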
return alpha;
}
static void InitHistogram(VP8Histogram* const histo) {
histo->max_value = 0;
histo->last_non_zero = 1;
}
//------------------------------------------------------------------------------
// Simplified k-Means, to assign Nb segments based on alpha-histogram
static void AssignSegments(VP8Encoder* const enc,
const int alphas[MAX_ALPHA + 1]) {
// 'num_segments_' is previously validated and <= NUM_MB_SEGMENTS, but an
// explicit check is needed to avoid spurious warning about 'n + 1' exceeding
// array bounds of 'centers' with some compilers (noticed with gcc-4.9).
const int nb = (enc->segment_hdr_.num_segments_ < NUM_MB_SEGMENTS) ?
enc->segment_hdr_.num_segments_ : NUM_MB_SEGMENTS;
int centers[NUM_MB_SEGMENTS];
int weighted_average = 0;
int map[MAX_ALPHA + 1];
int a, n, k;
int min_a = 0, max_a = MAX_ALPHA, range_a;
// 'int' type is ok for histo, and won't overflow
int accum[NUM_MB_SEGMENTS], dist_accum[NUM_MB_SEGMENTS];
assert(nb >= 1);
assert(nb <= NUM_MB_SEGMENTS);
// bracket the input
for (n = 0; n <= MAX_ALPHA && alphas[n] == 0; ++n) {}
min_a = n;
for (n = MAX_ALPHA; n > min_a && alphas[n] == 0; --n) {}
max_a = n;
range_a = max_a - min_a;
// Spread initial centers evenly
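  // e.g. nb == 4 -> centers at min_a + range_a * {1, 3, 5, 7} / 8.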
for (k = 0, n = 1; k < nb; ++k, n += 2) {
assert(n < 2 * nb);
centers[k] = min_a + (n * range_a) / (2 * nb);
}
for (k = 0; k < MAX_ITERS_K_MEANS; ++k) { // few iters are enough
int total_weight;
int displaced;
// Reset stats
for (n = 0; n < nb; ++n) {
accum[n] = 0;
dist_accum[n] = 0;
}
// Assign nearest center for each 'a'
n = 0; // track the nearest center for current 'a'
for (a = min_a; a <= max_a; ++a) {
if (alphas[a]) {
while (n + 1 < nb && abs(a - centers[n + 1]) < abs(a - centers[n])) {
n++;
}
map[a] = n;
// accumulate contribution into best centroid
dist_accum[n] += a * alphas[a];
accum[n] += alphas[a];
}
}
    // All points are classified. Move the centroids to the
// center of their respective cloud.
displaced = 0;
weighted_average = 0;
total_weight = 0;
for (n = 0; n < nb; ++n) {
if (accum[n]) {
const int new_center = (dist_accum[n] + accum[n] / 2) / accum[n];
displaced += abs(centers[n] - new_center);
centers[n] = new_center;
weighted_average += new_center * accum[n];
total_weight += accum[n];
}
}
weighted_average = (weighted_average + total_weight / 2) / total_weight;
if (displaced < 5) break; // no need to keep on looping...
}
// Map each original value to the closest centroid
for (n = 0; n < enc->mb_w_ * enc->mb_h_; ++n) {
VP8MBInfo* const mb = &enc->mb_info_[n];
const int alpha = mb->alpha_;
mb->segment_ = map[alpha];
mb->alpha_ = centers[map[alpha]]; // for the record.
}
if (nb > 1) {
const int smooth = (enc->config_->preprocessing & 1);
if (smooth) SmoothSegmentMap(enc);
}
SetSegmentAlphas(enc, centers, weighted_average); // pick some alphas.
}
//------------------------------------------------------------------------------
// Macroblock analysis: collect histogram for each mode, deduce the maximal
// susceptibility and set best modes for this macroblock.
// Segment assignment is done later.
// Number of modes to inspect for alpha_ evaluation. We don't need to test all
// the possible modes during the analysis phase: we would risk falling into a
// local optimum or being subject to boundary effects.
#define MAX_INTRA16_MODE 2
#define MAX_INTRA4_MODE 2
#define MAX_UV_MODE 2
static int MBAnalyzeBestIntra16Mode(VP8EncIterator* const it) {
const int max_mode = MAX_INTRA16_MODE;
int mode;
int best_alpha = DEFAULT_ALPHA;
int best_mode = 0;
VP8MakeLuma16Preds(it);
for (mode = 0; mode < max_mode; ++mode) {
VP8Histogram histo;
int alpha;
InitHistogram(&histo);
VP8CollectHistogram(it->yuv_in_ + Y_OFF_ENC,
it->yuv_p_ + VP8I16ModeOffsets[mode],
0, 16, &histo);
alpha = GetAlpha(&histo);
if (IS_BETTER_ALPHA(alpha, best_alpha)) {
best_alpha = alpha;
best_mode = mode;
}
}
VP8SetIntra16Mode(it, best_mode);
return best_alpha;
}
static int FastMBAnalyze(VP8EncIterator* const it) {
// Empirical cut-off value, should be around 16 (~=block size). We use the
// [8-17] range and favor intra4 at high quality, intra16 for low quality.
const int q = (int)it->enc_->config_->quality;
const uint32_t kThreshold = 8 + (17 - 8) * q / 100;
int k;
uint32_t dc[16], m, m2;
for (k = 0; k < 16; k += 4) {
VP8Mean16x4(it->yuv_in_ + Y_OFF_ENC + k * BPS, &dc[k]);
}
for (m = 0, m2 = 0, k = 0; k < 16; ++k) {
m += dc[k];
m2 += dc[k] * dc[k];
}
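  // By Cauchy-Schwarz, m * m <= 16 * m2 over the 16 DC values, so the ratio
  // m * m / m2 measures how uniform the 4x4 means are; when it exceeds
  // kThreshold, a flat DC16 prediction is deemed sufficient.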
if (kThreshold * m2 < m * m) {
VP8SetIntra16Mode(it, 0); // DC16
} else {
const uint8_t modes[16] = { 0 }; // DC4
VP8SetIntra4Mode(it, modes);
}
return 0;
}
static int MBAnalyzeBestUVMode(VP8EncIterator* const it) {
int best_alpha = DEFAULT_ALPHA;
int smallest_alpha = 0;
int best_mode = 0;
const int max_mode = MAX_UV_MODE;
int mode;
VP8MakeChroma8Preds(it);
for (mode = 0; mode < max_mode; ++mode) {
VP8Histogram histo;
int alpha;
InitHistogram(&histo);
VP8CollectHistogram(it->yuv_in_ + U_OFF_ENC,
it->yuv_p_ + VP8UVModeOffsets[mode],
16, 16 + 4 + 4, &histo);
alpha = GetAlpha(&histo);
if (IS_BETTER_ALPHA(alpha, best_alpha)) {
best_alpha = alpha;
}
// The best prediction mode tends to be the one with the smallest alpha.
if (mode == 0 || alpha < smallest_alpha) {
smallest_alpha = alpha;
best_mode = mode;
}
}
VP8SetIntraUVMode(it, best_mode);
return best_alpha;
}
static void MBAnalyze(VP8EncIterator* const it,
int alphas[MAX_ALPHA + 1],
int* const alpha, int* const uv_alpha) {
const VP8Encoder* const enc = it->enc_;
int best_alpha, best_uv_alpha;
VP8SetIntra16Mode(it, 0); // default: Intra16, DC_PRED
VP8SetSkip(it, 0); // not skipped
VP8SetSegment(it, 0); // default segment, spec-wise.
if (enc->method_ <= 1) {
best_alpha = FastMBAnalyze(it);
} else {
best_alpha = MBAnalyzeBestIntra16Mode(it);
}
best_uv_alpha = MBAnalyzeBestUVMode(it);
// Final susceptibility mix
best_alpha = (3 * best_alpha + best_uv_alpha + 2) >> 2;
best_alpha = FinalAlphaValue(best_alpha);
alphas[best_alpha]++;
it->mb_->alpha_ = best_alpha; // for later remapping.
// Accumulate for later complexity analysis.
*alpha += best_alpha; // mixed susceptibility (not just luma)
*uv_alpha += best_uv_alpha;
}
static void DefaultMBInfo(VP8MBInfo* const mb) {
mb->type_ = 1; // I16x16
mb->uv_mode_ = 0;
mb->skip_ = 0; // not skipped
mb->segment_ = 0; // default segment
mb->alpha_ = 0;
}
//------------------------------------------------------------------------------
// Main analysis loop:
// Collect all susceptibilities for each macroblock and record their
// distribution in alphas[]. Segments are assigned a posteriori, based on
// this histogram.
// We also pick an intra16 prediction mode, which shouldn't be considered
// final except for fast-encode settings. We can also pick some intra4 modes
// and decide intra4/intra16, but that's almost always a bad choice at
// this stage.
static void ResetAllMBInfo(VP8Encoder* const enc) {
int n;
for (n = 0; n < enc->mb_w_ * enc->mb_h_; ++n) {
DefaultMBInfo(&enc->mb_info_[n]);
}
// Default susceptibilities.
enc->dqm_[0].alpha_ = 0;
enc->dqm_[0].beta_ = 0;
// Note: we can't compute this alpha_ / uv_alpha_ -> set to default value.
enc->alpha_ = 0;
enc->uv_alpha_ = 0;
WebPReportProgress(enc->pic_, enc->percent_ + 20, &enc->percent_);
}
// struct used to collect job result
typedef struct {
WebPWorker worker;
int alphas[MAX_ALPHA + 1];
int alpha, uv_alpha;
VP8EncIterator it;
int delta_progress;
} SegmentJob;
// main work call
static int DoSegmentsJob(void* arg1, void* arg2) {
SegmentJob* const job = (SegmentJob*)arg1;
VP8EncIterator* const it = (VP8EncIterator*)arg2;
int ok = 1;
if (!VP8IteratorIsDone(it)) {
uint8_t tmp[32 + WEBP_ALIGN_CST];
uint8_t* const scratch = (uint8_t*)WEBP_ALIGN(tmp);
do {
// Let's pretend we have perfect lossless reconstruction.
VP8IteratorImport(it, scratch);
MBAnalyze(it, job->alphas, &job->alpha, &job->uv_alpha);
ok = VP8IteratorProgress(it, job->delta_progress);
} while (ok && VP8IteratorNext(it));
}
return ok;
}
static void MergeJobs(const SegmentJob* const src, SegmentJob* const dst) {
int i;
for (i = 0; i <= MAX_ALPHA; ++i) dst->alphas[i] += src->alphas[i];
dst->alpha += src->alpha;
dst->uv_alpha += src->uv_alpha;
}
// initialize the job struct with some tasks to perform
static void InitSegmentJob(VP8Encoder* const enc, SegmentJob* const job,
int start_row, int end_row) {
WebPGetWorkerInterface()->Init(&job->worker);
job->worker.data1 = job;
job->worker.data2 = &job->it;
job->worker.hook = DoSegmentsJob;
VP8IteratorInit(enc, &job->it);
VP8IteratorSetRow(&job->it, start_row);
VP8IteratorSetCountDown(&job->it, (end_row - start_row) * enc->mb_w_);
memset(job->alphas, 0, sizeof(job->alphas));
job->alpha = 0;
job->uv_alpha = 0;
  // only one of the two jobs can record the progress, since we don't
// expect the user's hook to be multi-thread safe
job->delta_progress = (start_row == 0) ? 20 : 0;
}
// main entry point
int VP8EncAnalyze(VP8Encoder* const enc) {
int ok = 1;
const int do_segments =
enc->config_->emulate_jpeg_size || // We need the complexity evaluation.
(enc->segment_hdr_.num_segments_ > 1) ||
(enc->method_ <= 1); // for method 0 - 1, we need preds_[] to be filled.
if (do_segments) {
const int last_row = enc->mb_h_;
// We give a little more than half of the work to the main thread.
const int split_row = (9 * last_row + 15) >> 4;
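// E.g. with last_row == 100, split_row == (900 + 15) >> 4 == 57: the main
// thread analyzes rows [0, 57) and the side job rows [57, 100).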
const int total_mb = last_row * enc->mb_w_;
#ifdef WEBP_USE_THREAD
const int kMinSplitRow = 2; // minimal rows needed for mt to be worth it
const int do_mt = (enc->thread_level_ > 0) && (split_row >= kMinSplitRow);
#else
const int do_mt = 0;
#endif
const WebPWorkerInterface* const worker_interface =
WebPGetWorkerInterface();
SegmentJob main_job;
if (do_mt) {
SegmentJob side_job;
// Note the use of '&' instead of '&&' because we must call the functions
// no matter what.
InitSegmentJob(enc, &main_job, 0, split_row);
InitSegmentJob(enc, &side_job, split_row, last_row);
// we don't need to call Reset() on main_job.worker, since we're calling
// WebPWorkerExecute() on it
ok &= worker_interface->Reset(&side_job.worker);
// launch the two jobs in parallel
if (ok) {
worker_interface->Launch(&side_job.worker);
worker_interface->Execute(&main_job.worker);
ok &= worker_interface->Sync(&side_job.worker);
ok &= worker_interface->Sync(&main_job.worker);
}
worker_interface->End(&side_job.worker);
if (ok) MergeJobs(&side_job, &main_job); // merge results together
} else {
// Even in the single-thread case, we use the generic Worker tools.
InitSegmentJob(enc, &main_job, 0, last_row);
worker_interface->Execute(&main_job.worker);
ok &= worker_interface->Sync(&main_job.worker);
}
worker_interface->End(&main_job.worker);
if (ok) {
enc->alpha_ = main_job.alpha / total_mb;
enc->uv_alpha_ = main_job.uv_alpha / total_mb;
AssignSegments(enc, main_job.alphas);
}
} else { // Use only one default segment.
ResetAllMBInfo(enc);
}
return ok;
}


@@ -0,0 +1,790 @@
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Improves a given set of backward references by analyzing its bit cost.
// The algorithm is similar to the Zopfli compression algorithm but tailored to
// images.
//
// Author: Vincent Rabaud (vrabaud@google.com)
//
#include <assert.h>
#include "src/enc/backward_references_enc.h"
#include "src/enc/histogram_enc.h"
#include "src/dsp/lossless_common.h"
#include "src/utils/color_cache_utils.h"
#include "src/utils/utils.h"
#define VALUES_IN_BYTE 256
extern void VP8LClearBackwardRefs(VP8LBackwardRefs* const refs);
extern int VP8LDistanceToPlaneCode(int xsize, int dist);
extern void VP8LBackwardRefsCursorAdd(VP8LBackwardRefs* const refs,
const PixOrCopy v);
typedef struct {
double alpha_[VALUES_IN_BYTE];
double red_[VALUES_IN_BYTE];
double blue_[VALUES_IN_BYTE];
double distance_[NUM_DISTANCE_CODES];
double* literal_;
} CostModel;
static void ConvertPopulationCountTableToBitEstimates(
int num_symbols, const uint32_t population_counts[], double output[]) {
uint32_t sum = 0;
int nonzeros = 0;
int i;
for (i = 0; i < num_symbols; ++i) {
sum += population_counts[i];
if (population_counts[i] > 0) {
++nonzeros;
}
}
if (nonzeros <= 1) {
memset(output, 0, num_symbols * sizeof(*output));
} else {
const double logsum = VP8LFastLog2(sum);
for (i = 0; i < num_symbols; ++i) {
output[i] = logsum - VP8LFastLog2(population_counts[i]);
}
}
}
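// The estimates above are Shannon bit costs -log2(count[i] / sum). For
// instance, counts {1, 3} give sum == 4, hence costs of log2(4) - log2(1)
// == 2.0 bits and log2(4) - log2(3) ~= 0.415 bits (up to VP8LFastLog2's
// approximation error).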
static int CostModelBuild(CostModel* const m, int xsize, int cache_bits,
const VP8LBackwardRefs* const refs) {
int ok = 0;
VP8LRefsCursor c = VP8LRefsCursorInit(refs);
VP8LHistogram* const histo = VP8LAllocateHistogram(cache_bits);
if (histo == NULL) goto Error;
// The following code is similar to VP8LHistogramCreate but converts the
// distance to plane code.
VP8LHistogramInit(histo, cache_bits, /*init_arrays=*/ 1);
while (VP8LRefsCursorOk(&c)) {
VP8LHistogramAddSinglePixOrCopy(histo, c.cur_pos, VP8LDistanceToPlaneCode,
xsize);
VP8LRefsCursorNext(&c);
}
ConvertPopulationCountTableToBitEstimates(
VP8LHistogramNumCodes(histo->palette_code_bits_),
histo->literal_, m->literal_);
ConvertPopulationCountTableToBitEstimates(
VALUES_IN_BYTE, histo->red_, m->red_);
ConvertPopulationCountTableToBitEstimates(
VALUES_IN_BYTE, histo->blue_, m->blue_);
ConvertPopulationCountTableToBitEstimates(
VALUES_IN_BYTE, histo->alpha_, m->alpha_);
ConvertPopulationCountTableToBitEstimates(
NUM_DISTANCE_CODES, histo->distance_, m->distance_);
ok = 1;
Error:
VP8LFreeHistogram(histo);
return ok;
}
static WEBP_INLINE double GetLiteralCost(const CostModel* const m, uint32_t v) {
return m->alpha_[v >> 24] +
m->red_[(v >> 16) & 0xff] +
m->literal_[(v >> 8) & 0xff] +
m->blue_[v & 0xff];
}
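// Note: the green channel ((v >> 8) & 0xff) is looked up in literal_, whose
// first VALUES_IN_BYTE entries hold the literal/green symbols; length codes
// and color-cache indices follow, as used by GetLengthCost() and
// GetCacheCost() below.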
static WEBP_INLINE double GetCacheCost(const CostModel* const m, uint32_t idx) {
const int literal_idx = VALUES_IN_BYTE + NUM_LENGTH_CODES + idx;
return m->literal_[literal_idx];
}
static WEBP_INLINE double GetLengthCost(const CostModel* const m,
uint32_t length) {
int code, extra_bits;
VP8LPrefixEncodeBits(length, &code, &extra_bits);
return m->literal_[VALUES_IN_BYTE + code] + extra_bits;
}
static WEBP_INLINE double GetDistanceCost(const CostModel* const m,
uint32_t distance) {
int code, extra_bits;
VP8LPrefixEncodeBits(distance, &code, &extra_bits);
return m->distance_[code] + extra_bits;
}
static WEBP_INLINE void AddSingleLiteralWithCostModel(
const uint32_t* const argb, VP8LColorCache* const hashers,
const CostModel* const cost_model, int idx, int use_color_cache,
float prev_cost, float* const cost, uint16_t* const dist_array) {
double cost_val = prev_cost;
const uint32_t color = argb[idx];
const int ix = use_color_cache ? VP8LColorCacheContains(hashers, color) : -1;
if (ix >= 0) {
// use_color_cache is true and hashers contains color
const double mul0 = 0.68;
cost_val += GetCacheCost(cost_model, ix) * mul0;
} else {
const double mul1 = 0.82;
if (use_color_cache) VP8LColorCacheInsert(hashers, color);
cost_val += GetLiteralCost(cost_model, color) * mul1;
}
if (cost[idx] > cost_val) {
cost[idx] = (float)cost_val;
dist_array[idx] = 1; // only one is inserted.
}
}
// -----------------------------------------------------------------------------
// CostManager and interval handling
// Empirical value that avoids high memory consumption while keeping good performance.
#define COST_CACHE_INTERVAL_SIZE_MAX 500
// To perform the backward reference search, every pixel at index index_ is
// considered, and the cost for the MAX_LENGTH following pixels is computed.
// Those following pixels at index index_ + k (k from 0 to MAX_LENGTH) have a
// cost of:
// cost_ = distance cost at index + GetLengthCost(cost_model, k)
// and the minimum value is kept. GetLengthCost(cost_model, k) is cached in an
// array of size MAX_LENGTH.
// Instead of performing MAX_LENGTH comparisons per pixel, we keep track of the
// minimal values using intervals of constant cost.
// An interval is defined by the index_ of the pixel that generated it and
// is only useful in a range of indices from start_ to end_ (exclusive), i.e.
// it contains the minimum value for pixels between start_ and end_.
// Intervals are stored in a linked list and ordered by start_. When a new
// interval has a better value, old intervals are split or removed. There are
// therefore no overlapping intervals.
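// Illustrative sketch (hypothetical numbers): if GetLengthCost(cost_model, k)
// were 10.0 bits for k in [0, 64) and 12.0 bits for k in [64, MAX_LENGTH),
// cost_cache_ would collapse into two CostCacheIntervals, and a match of
// length 100 at pixel p would push at most two CostIntervals, covering
// [p, p + 64) and [p + 64, p + 100).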
typedef struct CostInterval CostInterval;
struct CostInterval {
float cost_;
int start_;
int end_;
int index_;
CostInterval* previous_;
CostInterval* next_;
};
// The GetLengthCost(cost_model, k) values are cached in CostCacheIntervals.
typedef struct {
double cost_;
int start_;
int end_; // Exclusive.
} CostCacheInterval;
// This structure is in charge of managing intervals and costs.
// It stores the CostCacheIntervals, caches the different
// GetLengthCost(cost_model, k) values in cost_cache_, and holds the
// CostIntervals (whose count_ is limited by COST_CACHE_INTERVAL_SIZE_MAX).
#define COST_MANAGER_MAX_FREE_LIST 10
typedef struct {
CostInterval* head_;
int count_; // The number of stored intervals.
CostCacheInterval* cache_intervals_;
size_t cache_intervals_size_;
double cost_cache_[MAX_LENGTH]; // Contains the GetLengthCost(cost_model, k).
float* costs_;
uint16_t* dist_array_;
// Most of the time, we need only a few intervals, so we use a free-list to
// avoid fragmentation from small allocations in the most common cases.
CostInterval intervals_[COST_MANAGER_MAX_FREE_LIST];
CostInterval* free_intervals_;
// These are the remains of regular mallocs. Note that this list can't grow
// larger than COST_CACHE_INTERVAL_SIZE_MAX - COST_MANAGER_MAX_FREE_LIST.
CostInterval* recycled_intervals_;
} CostManager;
static void CostIntervalAddToFreeList(CostManager* const manager,
CostInterval* const interval) {
interval->next_ = manager->free_intervals_;
manager->free_intervals_ = interval;
}
static int CostIntervalIsInFreeList(const CostManager* const manager,
const CostInterval* const interval) {
return (interval >= &manager->intervals_[0] &&
interval <= &manager->intervals_[COST_MANAGER_MAX_FREE_LIST - 1]);
}
static void CostManagerInitFreeList(CostManager* const manager) {
int i;
manager->free_intervals_ = NULL;
for (i = 0; i < COST_MANAGER_MAX_FREE_LIST; ++i) {
CostIntervalAddToFreeList(manager, &manager->intervals_[i]);
}
}
static void DeleteIntervalList(CostManager* const manager,
const CostInterval* interval) {
while (interval != NULL) {
const CostInterval* const next = interval->next_;
if (!CostIntervalIsInFreeList(manager, interval)) {
WebPSafeFree((void*)interval);
} // else: do nothing
interval = next;
}
}
static void CostManagerClear(CostManager* const manager) {
if (manager == NULL) return;
WebPSafeFree(manager->costs_);
WebPSafeFree(manager->cache_intervals_);
// Clear the interval lists.
DeleteIntervalList(manager, manager->head_);
manager->head_ = NULL;
DeleteIntervalList(manager, manager->recycled_intervals_);
manager->recycled_intervals_ = NULL;
// Reset pointers, count_ and cache_intervals_size_.
memset(manager, 0, sizeof(*manager));
CostManagerInitFreeList(manager);
}
static int CostManagerInit(CostManager* const manager,
uint16_t* const dist_array, int pix_count,
const CostModel* const cost_model) {
int i;
const int cost_cache_size = (pix_count > MAX_LENGTH) ? MAX_LENGTH : pix_count;
manager->costs_ = NULL;
manager->cache_intervals_ = NULL;
manager->head_ = NULL;
manager->recycled_intervals_ = NULL;
manager->count_ = 0;
manager->dist_array_ = dist_array;
CostManagerInitFreeList(manager);
// Fill in the cost_cache_.
manager->cache_intervals_size_ = 1;
manager->cost_cache_[0] = GetLengthCost(cost_model, 0);
for (i = 1; i < cost_cache_size; ++i) {
manager->cost_cache_[i] = GetLengthCost(cost_model, i);
// Get the number of bound intervals.
if (manager->cost_cache_[i] != manager->cost_cache_[i - 1]) {
++manager->cache_intervals_size_;
}
}
// With the current cost model, we usually have fewer than 20 intervals.
// The worst-case scenario with a cost model would be if every length had a
// different cost, hence MAX_LENGTH intervals, but that is impossible with
// the current implementation that spirals around a pixel.
assert(manager->cache_intervals_size_ <= MAX_LENGTH);
manager->cache_intervals_ = (CostCacheInterval*)WebPSafeMalloc(
manager->cache_intervals_size_, sizeof(*manager->cache_intervals_));
if (manager->cache_intervals_ == NULL) {
CostManagerClear(manager);
return 0;
}
// Fill in the cache_intervals_.
{
CostCacheInterval* cur = manager->cache_intervals_;
// Consecutive values in cost_cache_ are compared and, whenever the value
// changes, a new interval is created and bounded.
cur->start_ = 0;
cur->end_ = 1;
cur->cost_ = manager->cost_cache_[0];
for (i = 1; i < cost_cache_size; ++i) {
const double cost_val = manager->cost_cache_[i];
if (cost_val != cur->cost_) {
++cur;
// Initialize an interval.
cur->start_ = i;
cur->cost_ = cost_val;
}
cur->end_ = i + 1;
}
}
manager->costs_ = (float*)WebPSafeMalloc(pix_count, sizeof(*manager->costs_));
if (manager->costs_ == NULL) {
CostManagerClear(manager);
return 0;
}
// Set the initial costs_ high for every pixel as we will keep the minimum.
for (i = 0; i < pix_count; ++i) manager->costs_[i] = 1e38f;
return 1;
}
// Given the cost and the position that define an interval, update the cost at
// pixel 'i' if it is smaller than the previously computed value.
static WEBP_INLINE void UpdateCost(CostManager* const manager, int i,
int position, float cost) {
const int k = i - position;
assert(k >= 0 && k < MAX_LENGTH);
if (manager->costs_[i] > cost) {
manager->costs_[i] = cost;
manager->dist_array_[i] = k + 1;
}
}
// Given the cost and the position that define an interval, update the cost for
// all the pixels between 'start' and 'end' excluded.
static WEBP_INLINE void UpdateCostPerInterval(CostManager* const manager,
int start, int end, int position,
float cost) {
int i;
for (i = start; i < end; ++i) UpdateCost(manager, i, position, cost);
}
// Given two intervals, make 'prev' be the previous one of 'next' in 'manager'.
static WEBP_INLINE void ConnectIntervals(CostManager* const manager,
CostInterval* const prev,
CostInterval* const next) {
if (prev != NULL) {
prev->next_ = next;
} else {
manager->head_ = next;
}
if (next != NULL) next->previous_ = prev;
}
// Pop an interval in the manager.
static WEBP_INLINE void PopInterval(CostManager* const manager,
CostInterval* const interval) {
if (interval == NULL) return;
ConnectIntervals(manager, interval->previous_, interval->next_);
if (CostIntervalIsInFreeList(manager, interval)) {
CostIntervalAddToFreeList(manager, interval);
} else { // recycle regularly malloc'd intervals too
interval->next_ = manager->recycled_intervals_;
manager->recycled_intervals_ = interval;
}
--manager->count_;
assert(manager->count_ >= 0);
}
// Update the cost at index i by going over all the stored intervals that
// overlap with i.
// If 'do_clean_intervals' is non-zero, intervals that end before 'i' are
// popped.
static WEBP_INLINE void UpdateCostAtIndex(CostManager* const manager, int i,
int do_clean_intervals) {
CostInterval* current = manager->head_;
while (current != NULL && current->start_ <= i) {
CostInterval* const next = current->next_;
if (current->end_ <= i) {
if (do_clean_intervals) {
// We have an outdated interval, remove it.
PopInterval(manager, current);
}
} else {
UpdateCost(manager, i, current->index_, current->cost_);
}
current = next;
}
}
// Given a currently orphaned interval and the interval that preceded it
// before it was orphaned (which can be NULL), set it at the right place in
// the list of intervals, using the start_ ordering and the previous interval
// as a hint.
static WEBP_INLINE void PositionOrphanInterval(CostManager* const manager,
CostInterval* const current,
CostInterval* previous) {
assert(current != NULL);
if (previous == NULL) previous = manager->head_;
while (previous != NULL && current->start_ < previous->start_) {
previous = previous->previous_;
}
while (previous != NULL && previous->next_ != NULL &&
previous->next_->start_ < current->start_) {
previous = previous->next_;
}
if (previous != NULL) {
ConnectIntervals(manager, current, previous->next_);
} else {
ConnectIntervals(manager, current, manager->head_);
}
ConnectIntervals(manager, previous, current);
}
// Insert an interval in the list contained in the manager, using interval_in
// as a starting hint. The intervals are sorted by start_ value.
static WEBP_INLINE void InsertInterval(CostManager* const manager,
CostInterval* const interval_in,
float cost, int position, int start,
int end) {
CostInterval* interval_new;
if (start >= end) return;
if (manager->count_ >= COST_CACHE_INTERVAL_SIZE_MAX) {
// Serialize the interval if we cannot store it.
UpdateCostPerInterval(manager, start, end, position, cost);
return;
}
if (manager->free_intervals_ != NULL) {
interval_new = manager->free_intervals_;
manager->free_intervals_ = interval_new->next_;
} else if (manager->recycled_intervals_ != NULL) {
interval_new = manager->recycled_intervals_;
manager->recycled_intervals_ = interval_new->next_;
} else { // malloc for good
interval_new = (CostInterval*)WebPSafeMalloc(1, sizeof(*interval_new));
if (interval_new == NULL) {
// Write down the interval if we cannot create it.
UpdateCostPerInterval(manager, start, end, position, cost);
return;
}
}
interval_new->cost_ = cost;
interval_new->index_ = position;
interval_new->start_ = start;
interval_new->end_ = end;
PositionOrphanInterval(manager, interval_new, interval_in);
++manager->count_;
}
// Given a new cost interval defined by its start position, its length and
// its distance_cost, add its contributions to the previous intervals and
// costs. If handling the interval or one of its subintervals becomes too
// heavy, its contribution is added to the costs right away.
static WEBP_INLINE void PushInterval(CostManager* const manager,
double distance_cost, int position,
int len) {
size_t i;
CostInterval* interval = manager->head_;
CostInterval* interval_next;
const CostCacheInterval* const cost_cache_intervals =
manager->cache_intervals_;
// If the interval is small enough, no need to deal with the heavy
// interval logic, just serialize it right away. This constant is empirical.
const int kSkipDistance = 10;
if (len < kSkipDistance) {
int j;
for (j = position; j < position + len; ++j) {
const int k = j - position;
float cost_tmp;
assert(k >= 0 && k < MAX_LENGTH);
cost_tmp = (float)(distance_cost + manager->cost_cache_[k]);
if (manager->costs_[j] > cost_tmp) {
manager->costs_[j] = cost_tmp;
manager->dist_array_[j] = k + 1;
}
}
return;
}
for (i = 0; i < manager->cache_intervals_size_ &&
cost_cache_intervals[i].start_ < len;
++i) {
// Define the intersection of the ith interval with the new one.
int start = position + cost_cache_intervals[i].start_;
const int end = position + (cost_cache_intervals[i].end_ > len
? len
: cost_cache_intervals[i].end_);
const float cost = (float)(distance_cost + cost_cache_intervals[i].cost_);
for (; interval != NULL && interval->start_ < end;
interval = interval_next) {
interval_next = interval->next_;
// Make sure we have some overlap
if (start >= interval->end_) continue;
if (cost >= interval->cost_) {
// In the following diagrams, the lower an interval is drawn, the cheaper it is.
// [**********************************************************[
// start end
// [----------------------------------[
// interval->start_ interval->end_
// If we are worse than what we already have, add whatever we have so
// far up to interval.
const int start_new = interval->end_;
InsertInterval(manager, interval, cost, position, start,
interval->start_);
start = start_new;
if (start >= end) break;
continue;
}
if (start <= interval->start_) {
if (interval->end_ <= end) {
// [----------------------------------[
// interval->start_ interval->end_
// [**************************************************************[
// start end
// We can safely remove the old interval as it is fully included.
PopInterval(manager, interval);
} else {
// [------------------------------------[
// interval->start_ interval->end_
// [*****************************[
// start end
interval->start_ = end;
break;
}
} else {
if (end < interval->end_) {
// [--------------------------------------------------------------[
// interval->start_ interval->end_
// [*****************************[
// start end
// We have to split the old interval as it fully contains the new one.
const int end_original = interval->end_;
interval->end_ = start;
InsertInterval(manager, interval, interval->cost_, interval->index_,
end, end_original);
interval = interval->next_;
break;
} else {
// [------------------------------------[
// interval->start_ interval->end_
// [*****************************[
// start end
interval->end_ = start;
}
}
}
// Insert the remaining interval from start to end.
InsertInterval(manager, interval, cost, position, start, end);
}
}
static int BackwardReferencesHashChainDistanceOnly(
int xsize, int ysize, const uint32_t* const argb, int cache_bits,
const VP8LHashChain* const hash_chain, const VP8LBackwardRefs* const refs,
uint16_t* const dist_array) {
int i;
int ok = 0;
int cc_init = 0;
const int pix_count = xsize * ysize;
const int use_color_cache = (cache_bits > 0);
const size_t literal_array_size =
sizeof(double) * (NUM_LITERAL_CODES + NUM_LENGTH_CODES +
((cache_bits > 0) ? (1 << cache_bits) : 0));
const size_t cost_model_size = sizeof(CostModel) + literal_array_size;
CostModel* const cost_model =
(CostModel*)WebPSafeCalloc(1ULL, cost_model_size);
VP8LColorCache hashers;
CostManager* cost_manager =
(CostManager*)WebPSafeMalloc(1ULL, sizeof(*cost_manager));
int offset_prev = -1, len_prev = -1;
double offset_cost = -1;
int first_offset_is_constant = -1; // initialized with 'impossible' value
int reach = 0;
if (cost_model == NULL || cost_manager == NULL) goto Error;
cost_model->literal_ = (double*)(cost_model + 1);
if (use_color_cache) {
cc_init = VP8LColorCacheInit(&hashers, cache_bits);
if (!cc_init) goto Error;
}
if (!CostModelBuild(cost_model, xsize, cache_bits, refs)) {
goto Error;
}
if (!CostManagerInit(cost_manager, dist_array, pix_count, cost_model)) {
goto Error;
}
// We loop one pixel at a time, but store the currently best candidate for
// every not-yet-processed location reachable from this point.
dist_array[0] = 0;
// Add first pixel as literal.
AddSingleLiteralWithCostModel(argb, &hashers, cost_model, 0, use_color_cache,
0.f, cost_manager->costs_, dist_array);
for (i = 1; i < pix_count; ++i) {
const float prev_cost = cost_manager->costs_[i - 1];
int offset, len;
VP8LHashChainFindCopy(hash_chain, i, &offset, &len);
// Try adding the pixel as a literal.
AddSingleLiteralWithCostModel(argb, &hashers, cost_model, i,
use_color_cache, prev_cost,
cost_manager->costs_, dist_array);
// If we are dealing with a non-literal.
if (len >= 2) {
if (offset != offset_prev) {
const int code = VP8LDistanceToPlaneCode(xsize, offset);
offset_cost = GetDistanceCost(cost_model, code);
first_offset_is_constant = 1;
PushInterval(cost_manager, prev_cost + offset_cost, i, len);
} else {
assert(offset_cost >= 0);
assert(len_prev >= 0);
assert(first_offset_is_constant == 0 || first_offset_is_constant == 1);
// Instead of considering all contributions from a pixel i by calling:
// PushInterval(cost_manager, prev_cost + offset_cost, i, len);
// we optimize these contributions in case offset_cost stays the same
// for consecutive pixels. This describes a set of pixels similar to a
// previous set (e.g. constant color regions).
if (first_offset_is_constant) {
reach = i - 1 + len_prev - 1;
first_offset_is_constant = 0;
}
if (i + len - 1 > reach) {
// We can only go further with the same offset if the previous
// length was maxed, hence len_prev == len == MAX_LENGTH.
// TODO(vrabaud), bump i to the end right away (insert cache and
// update cost).
// TODO(vrabaud), check if one of the points in between does not have
// a lower cost.
// Already consider the pixel at "reach" to add intervals that are
// better than whatever we add.
int offset_j, len_j = 0;
int j;
assert(len == MAX_LENGTH || len == pix_count - i);
// Figure out the last consecutive pixel within [i, reach + 1] with
// the same offset.
for (j = i; j <= reach; ++j) {
VP8LHashChainFindCopy(hash_chain, j + 1, &offset_j, &len_j);
if (offset_j != offset) {
VP8LHashChainFindCopy(hash_chain, j, &offset_j, &len_j);
break;
}
}
// Update the cost at j - 1 and j.
UpdateCostAtIndex(cost_manager, j - 1, 0);
UpdateCostAtIndex(cost_manager, j, 0);
PushInterval(cost_manager, cost_manager->costs_[j - 1] + offset_cost,
j, len_j);
reach = j + len_j - 1;
}
}
}
UpdateCostAtIndex(cost_manager, i, 1);
offset_prev = offset;
len_prev = len;
}
ok = !refs->error_;
Error:
if (cc_init) VP8LColorCacheClear(&hashers);
CostManagerClear(cost_manager);
WebPSafeFree(cost_model);
WebPSafeFree(cost_manager);
return ok;
}
// We pack the path at the end of *dist_array and return
// a pointer to this part of the array. Example:
// dist_array = [1x2xx3x2] => packed [1x2x1232], chosen_path = [1232]
static void TraceBackwards(uint16_t* const dist_array,
int dist_array_size,
uint16_t** const chosen_path,
int* const chosen_path_size) {
uint16_t* path = dist_array + dist_array_size;
uint16_t* cur = dist_array + dist_array_size - 1;
while (cur >= dist_array) {
const int k = *cur;
--path;
*path = k;
cur -= k;
}
*chosen_path = path;
*chosen_path_size = (int)(dist_array + dist_array_size - path);
}
static int BackwardReferencesHashChainFollowChosenPath(
const uint32_t* const argb, int cache_bits,
const uint16_t* const chosen_path, int chosen_path_size,
const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs) {
const int use_color_cache = (cache_bits > 0);
int ix;
int i = 0;
int ok = 0;
int cc_init = 0;
VP8LColorCache hashers;
if (use_color_cache) {
cc_init = VP8LColorCacheInit(&hashers, cache_bits);
if (!cc_init) goto Error;
}
VP8LClearBackwardRefs(refs);
for (ix = 0; ix < chosen_path_size; ++ix) {
const int len = chosen_path[ix];
if (len != 1) {
int k;
const int offset = VP8LHashChainFindOffset(hash_chain, i);
VP8LBackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(offset, len));
if (use_color_cache) {
for (k = 0; k < len; ++k) {
VP8LColorCacheInsert(&hashers, argb[i + k]);
}
}
i += len;
} else {
PixOrCopy v;
const int idx =
use_color_cache ? VP8LColorCacheContains(&hashers, argb[i]) : -1;
if (idx >= 0) {
// use_color_cache is true and hashers contains argb[i]
// push pixel as a color cache index
v = PixOrCopyCreateCacheIdx(idx);
} else {
if (use_color_cache) VP8LColorCacheInsert(&hashers, argb[i]);
v = PixOrCopyCreateLiteral(argb[i]);
}
VP8LBackwardRefsCursorAdd(refs, v);
++i;
}
}
ok = !refs->error_;
Error:
if (cc_init) VP8LColorCacheClear(&hashers);
return ok;
}
// Returns 1 on success.
extern int VP8LBackwardReferencesTraceBackwards(
int xsize, int ysize, const uint32_t* const argb, int cache_bits,
const VP8LHashChain* const hash_chain,
const VP8LBackwardRefs* const refs_src, VP8LBackwardRefs* const refs_dst);
int VP8LBackwardReferencesTraceBackwards(int xsize, int ysize,
const uint32_t* const argb,
int cache_bits,
const VP8LHashChain* const hash_chain,
const VP8LBackwardRefs* const refs_src,
VP8LBackwardRefs* const refs_dst) {
int ok = 0;
const int dist_array_size = xsize * ysize;
uint16_t* chosen_path = NULL;
int chosen_path_size = 0;
uint16_t* dist_array =
(uint16_t*)WebPSafeMalloc(dist_array_size, sizeof(*dist_array));
if (dist_array == NULL) goto Error;
if (!BackwardReferencesHashChainDistanceOnly(
xsize, ysize, argb, cache_bits, hash_chain, refs_src, dist_array)) {
goto Error;
}
TraceBackwards(dist_array, dist_array_size, &chosen_path, &chosen_path_size);
if (!BackwardReferencesHashChainFollowChosenPath(
argb, cache_bits, chosen_path, chosen_path_size, hash_chain,
refs_dst)) {
goto Error;
}
ok = 1;
Error:
WebPSafeFree(dist_array);
return ok;
}

File diff suppressed because it is too large


@@ -0,0 +1,157 @@
// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Coding tools configuration
//
// Author: Skal (pascal.massimino@gmail.com)
#ifdef HAVE_CONFIG_H
#include "src/webp/config.h"
#endif
#include "src/webp/encode.h"
//------------------------------------------------------------------------------
// WebPConfig
//------------------------------------------------------------------------------
int WebPConfigInitInternal(WebPConfig* config,
WebPPreset preset, float quality, int version) {
if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_ENCODER_ABI_VERSION)) {
return 0; // caller/system version mismatch!
}
if (config == NULL) return 0;
config->quality = quality;
config->target_size = 0;
config->target_PSNR = 0.;
config->method = 4;
config->sns_strength = 50;
config->filter_strength = 60; // mid-filtering
config->filter_sharpness = 0;
config->filter_type = 1; // default: strong (so U/V is filtered too)
config->partitions = 0;
config->segments = 4;
config->pass = 1;
config->qmin = 0;
config->qmax = 100;
config->show_compressed = 0;
config->preprocessing = 0;
config->autofilter = 0;
config->partition_limit = 0;
config->alpha_compression = 1;
config->alpha_filtering = 1;
config->alpha_quality = 100;
config->lossless = 0;
config->exact = 0;
config->image_hint = WEBP_HINT_DEFAULT;
config->emulate_jpeg_size = 0;
config->thread_level = 0;
config->low_memory = 0;
config->near_lossless = 100;
config->use_delta_palette = 0;
config->use_sharp_yuv = 0;
// TODO(skal): tune.
switch (preset) {
case WEBP_PRESET_PICTURE:
config->sns_strength = 80;
config->filter_sharpness = 4;
config->filter_strength = 35;
config->preprocessing &= ~2; // no dithering
break;
case WEBP_PRESET_PHOTO:
config->sns_strength = 80;
config->filter_sharpness = 3;
config->filter_strength = 30;
config->preprocessing |= 2;
break;
case WEBP_PRESET_DRAWING:
config->sns_strength = 25;
config->filter_sharpness = 6;
config->filter_strength = 10;
break;
case WEBP_PRESET_ICON:
config->sns_strength = 0;
config->filter_strength = 0; // disable filtering to retain sharpness
config->preprocessing &= ~2; // no dithering
break;
case WEBP_PRESET_TEXT:
config->sns_strength = 0;
config->filter_strength = 0; // disable filtering to retain sharpness
config->preprocessing &= ~2; // no dithering
config->segments = 2;
break;
case WEBP_PRESET_DEFAULT:
default:
break;
}
return WebPValidateConfig(config);
}
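// A minimal usage sketch (illustrative, not part of this file): callers
// normally reach this function through the WebPConfigPreset() /
// WebPConfigInit() inline wrappers of src/webp/encode.h, which supply the
// compile-time ABI version:
//
//   WebPConfig config;
//   if (!WebPConfigPreset(&config, WEBP_PRESET_PHOTO, 75.f)) return 0;
//   config.method = 6;  // optional tweak: slowest but densest encoding
//   if (!WebPValidateConfig(&config)) return 0;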
int WebPValidateConfig(const WebPConfig* config) {
if (config == NULL) return 0;
if (config->quality < 0 || config->quality > 100) return 0;
if (config->target_size < 0) return 0;
if (config->target_PSNR < 0) return 0;
if (config->method < 0 || config->method > 6) return 0;
if (config->segments < 1 || config->segments > 4) return 0;
if (config->sns_strength < 0 || config->sns_strength > 100) return 0;
if (config->filter_strength < 0 || config->filter_strength > 100) return 0;
if (config->filter_sharpness < 0 || config->filter_sharpness > 7) return 0;
if (config->filter_type < 0 || config->filter_type > 1) return 0;
if (config->autofilter < 0 || config->autofilter > 1) return 0;
if (config->pass < 1 || config->pass > 10) return 0;
if (config->qmin < 0 || config->qmax > 100 || config->qmin > config->qmax) {
return 0;
}
if (config->show_compressed < 0 || config->show_compressed > 1) return 0;
if (config->preprocessing < 0 || config->preprocessing > 7) return 0;
if (config->partitions < 0 || config->partitions > 3) return 0;
if (config->partition_limit < 0 || config->partition_limit > 100) return 0;
if (config->alpha_compression < 0) return 0;
if (config->alpha_filtering < 0) return 0;
if (config->alpha_quality < 0 || config->alpha_quality > 100) return 0;
if (config->lossless < 0 || config->lossless > 1) return 0;
if (config->near_lossless < 0 || config->near_lossless > 100) return 0;
if (config->image_hint >= WEBP_HINT_LAST) return 0;
if (config->emulate_jpeg_size < 0 || config->emulate_jpeg_size > 1) return 0;
if (config->thread_level < 0 || config->thread_level > 1) return 0;
if (config->low_memory < 0 || config->low_memory > 1) return 0;
if (config->exact < 0 || config->exact > 1) return 0;
if (config->use_delta_palette < 0 || config->use_delta_palette > 1) {
return 0;
}
if (config->use_sharp_yuv < 0 || config->use_sharp_yuv > 1) return 0;
return 1;
}
//------------------------------------------------------------------------------
#define MAX_LEVEL 9
// Mapping between -z level and -m / -q parameter settings.
static const struct {
uint8_t method_;
uint8_t quality_;
} kLosslessPresets[MAX_LEVEL + 1] = {
{ 0, 0 }, { 1, 20 }, { 2, 25 }, { 3, 30 }, { 3, 50 },
{ 4, 50 }, { 4, 75 }, { 4, 90 }, { 5, 90 }, { 6, 100 }
};
int WebPConfigLosslessPreset(WebPConfig* config, int level) {
if (config == NULL || level < 0 || level > MAX_LEVEL) return 0;
config->lossless = 1;
config->method = kLosslessPresets[level].method_;
config->quality = kLosslessPresets[level].quality_;
return 1;
}
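// For example, WebPConfigLosslessPreset(&config, 6) maps to method 4 /
// quality 75 in the table above; level 9 selects the slowest, densest
// setting (method 6 / quality 100).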
//------------------------------------------------------------------------------


@@ -0,0 +1,342 @@
// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Cost tables for level and modes
//
// Author: Skal (pascal.massimino@gmail.com)
#include "src/enc/cost_enc.h"
//------------------------------------------------------------------------------
// Level cost tables
// For each given level, the following table gives the pattern of contexts to
// use for coding it (in [][0]) as well as the bit value to use for each
// context (in [][1]).
const uint16_t VP8LevelCodes[MAX_VARIABLE_LEVEL][2] = {
{0x001, 0x000}, {0x007, 0x001}, {0x00f, 0x005},
{0x00f, 0x00d}, {0x033, 0x003}, {0x033, 0x003}, {0x033, 0x023},
{0x033, 0x023}, {0x033, 0x023}, {0x033, 0x023}, {0x0d3, 0x013},
{0x0d3, 0x013}, {0x0d3, 0x013}, {0x0d3, 0x013}, {0x0d3, 0x013},
{0x0d3, 0x013}, {0x0d3, 0x013}, {0x0d3, 0x013}, {0x0d3, 0x093},
{0x0d3, 0x093}, {0x0d3, 0x093}, {0x0d3, 0x093}, {0x0d3, 0x093},
{0x0d3, 0x093}, {0x0d3, 0x093}, {0x0d3, 0x093}, {0x0d3, 0x093},
{0x0d3, 0x093}, {0x0d3, 0x093}, {0x0d3, 0x093}, {0x0d3, 0x093},
{0x0d3, 0x093}, {0x0d3, 0x093}, {0x0d3, 0x093}, {0x153, 0x053},
{0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053},
{0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053},
{0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053},
{0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053},
{0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053},
{0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053},
{0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053},
{0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x153}
};
static int VariableLevelCost(int level, const uint8_t probas[NUM_PROBAS]) {
int pattern = VP8LevelCodes[level - 1][0];
int bits = VP8LevelCodes[level - 1][1];
int cost = 0;
int i;
for (i = 2; pattern; ++i) {
if (pattern & 1) {
cost += VP8BitCost(bits & 1, probas[i]);
}
bits >>= 1;
pattern >>= 1;
}
return cost;
}
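// Worked example: level 2 uses VP8LevelCodes[1] == {0x007, 0x001}, so the
// loop above visits contexts 2, 3 and 4 and accumulates
// VP8BitCost(1, probas[2]) + VP8BitCost(0, probas[3]) +
// VP8BitCost(0, probas[4]).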
//------------------------------------------------------------------------------
// Pre-calc level costs once for all
void VP8CalculateLevelCosts(VP8EncProba* const proba) {
int ctype, band, ctx;
if (!proba->dirty_) return; // nothing to do.
for (ctype = 0; ctype < NUM_TYPES; ++ctype) {
int n;
for (band = 0; band < NUM_BANDS; ++band) {
for (ctx = 0; ctx < NUM_CTX; ++ctx) {
const uint8_t* const p = proba->coeffs_[ctype][band][ctx];
uint16_t* const table = proba->level_cost_[ctype][band][ctx];
const int cost0 = (ctx > 0) ? VP8BitCost(1, p[0]) : 0;
const int cost_base = VP8BitCost(1, p[1]) + cost0;
int v;
table[0] = VP8BitCost(0, p[1]) + cost0;
for (v = 1; v <= MAX_VARIABLE_LEVEL; ++v) {
table[v] = cost_base + VariableLevelCost(v, p);
}
// From level 67 and up, the variable part of the cost is
// actually constant.
}
}
for (n = 0; n < 16; ++n) { // replicate bands. We don't need to sentinel.
for (ctx = 0; ctx < NUM_CTX; ++ctx) {
proba->remapped_costs_[ctype][n][ctx] =
proba->level_cost_[ctype][VP8EncBands[n]][ctx];
}
}
}
proba->dirty_ = 0;
}
//------------------------------------------------------------------------------
// Mode cost tables.
// These are the fixed probabilities (in the coding trees) turned into bit-cost
// by calling VP8BitCost().
const uint16_t VP8FixedCostsUV[4] = { 302, 984, 439, 642 };
// note: these values include the fixed VP8BitCost(1, 145) mode selection cost.
const uint16_t VP8FixedCostsI16[4] = { 663, 919, 872, 919 };
const uint16_t VP8FixedCostsI4[NUM_BMODES][NUM_BMODES][NUM_BMODES] = {
{ { 40, 1151, 1723, 1874, 2103, 2019, 1628, 1777, 2226, 2137 },
{ 192, 469, 1296, 1308, 1849, 1794, 1781, 1703, 1713, 1522 },
{ 142, 910, 762, 1684, 1849, 1576, 1460, 1305, 1801, 1657 },
{ 559, 641, 1370, 421, 1182, 1569, 1612, 1725, 863, 1007 },
{ 299, 1059, 1256, 1108, 636, 1068, 1581, 1883, 869, 1142 },
{ 277, 1111, 707, 1362, 1089, 672, 1603, 1541, 1545, 1291 },
{ 214, 781, 1609, 1303, 1632, 2229, 726, 1560, 1713, 918 },
{ 152, 1037, 1046, 1759, 1983, 2174, 1358, 742, 1740, 1390 },
{ 512, 1046, 1420, 753, 752, 1297, 1486, 1613, 460, 1207 },
{ 424, 827, 1362, 719, 1462, 1202, 1199, 1476, 1199, 538 } },
{ { 240, 402, 1134, 1491, 1659, 1505, 1517, 1555, 1979, 2099 },
{ 467, 242, 960, 1232, 1714, 1620, 1834, 1570, 1676, 1391 },
{ 500, 455, 463, 1507, 1699, 1282, 1564, 982, 2114, 2114 },
{ 672, 643, 1372, 331, 1589, 1667, 1453, 1938, 996, 876 },
{ 458, 783, 1037, 911, 738, 968, 1165, 1518, 859, 1033 },
{ 504, 815, 504, 1139, 1219, 719, 1506, 1085, 1268, 1268 },
{ 333, 630, 1445, 1239, 1883, 3672, 799, 1548, 1865, 598 },
{ 399, 644, 746, 1342, 1856, 1350, 1493, 613, 1855, 1015 },
{ 622, 749, 1205, 608, 1066, 1408, 1290, 1406, 546, 971 },
{ 500, 753, 1041, 668, 1230, 1617, 1297, 1425, 1383, 523 } },
{ { 394, 553, 523, 1502, 1536, 981, 1608, 1142, 1666, 2181 },
{ 655, 430, 375, 1411, 1861, 1220, 1677, 1135, 1978, 1553 },
{ 690, 640, 245, 1954, 2070, 1194, 1528, 982, 1972, 2232 },
{ 559, 834, 741, 867, 1131, 980, 1225, 852, 1092, 784 },
{ 690, 875, 516, 959, 673, 894, 1056, 1190, 1528, 1126 },
{ 740, 951, 384, 1277, 1177, 492, 1579, 1155, 1846, 1513 },
{ 323, 775, 1062, 1776, 3062, 1274, 813, 1188, 1372, 655 },
{ 488, 971, 484, 1767, 1515, 1775, 1115, 503, 1539, 1461 },
{ 740, 1006, 998, 709, 851, 1230, 1337, 788, 741, 721 },
{ 522, 1073, 573, 1045, 1346, 887, 1046, 1146, 1203, 697 } },
{ { 105, 864, 1442, 1009, 1934, 1840, 1519, 1920, 1673, 1579 },
{ 534, 305, 1193, 683, 1388, 2164, 1802, 1894, 1264, 1170 },
{ 305, 518, 877, 1108, 1426, 3215, 1425, 1064, 1320, 1242 },
{ 683, 732, 1927, 257, 1493, 2048, 1858, 1552, 1055, 947 },
{ 394, 814, 1024, 660, 959, 1556, 1282, 1289, 893, 1047 },
{ 528, 615, 996, 940, 1201, 635, 1094, 2515, 803, 1358 },
{ 347, 614, 1609, 1187, 3133, 1345, 1007, 1339, 1017, 667 },
{ 218, 740, 878, 1605, 3650, 3650, 1345, 758, 1357, 1617 },
{ 672, 750, 1541, 558, 1257, 1599, 1870, 2135, 402, 1087 },
{ 592, 684, 1161, 430, 1092, 1497, 1475, 1489, 1095, 822 } },
{ { 228, 1056, 1059, 1368, 752, 982, 1512, 1518, 987, 1782 },
{ 494, 514, 818, 942, 965, 892, 1610, 1356, 1048, 1363 },
{ 512, 648, 591, 1042, 761, 991, 1196, 1454, 1309, 1463 },
{ 683, 749, 1043, 676, 841, 1396, 1133, 1138, 654, 939 },
{ 622, 1101, 1126, 994, 361, 1077, 1203, 1318, 877, 1219 },
{ 631, 1068, 857, 1650, 651, 477, 1650, 1419, 828, 1170 },
{ 555, 727, 1068, 1335, 3127, 1339, 820, 1331, 1077, 429 },
{ 504, 879, 624, 1398, 889, 889, 1392, 808, 891, 1406 },
{ 683, 1602, 1289, 977, 578, 983, 1280, 1708, 406, 1122 },
{ 399, 865, 1433, 1070, 1072, 764, 968, 1477, 1223, 678 } },
{ { 333, 760, 935, 1638, 1010, 529, 1646, 1410, 1472, 2219 },
{ 512, 494, 750, 1160, 1215, 610, 1870, 1868, 1628, 1169 },
{ 572, 646, 492, 1934, 1208, 603, 1580, 1099, 1398, 1995 },
{ 786, 789, 942, 581, 1018, 951, 1599, 1207, 731, 768 },
{ 690, 1015, 672, 1078, 582, 504, 1693, 1438, 1108, 2897 },
{ 768, 1267, 571, 2005, 1243, 244, 2881, 1380, 1786, 1453 },
{ 452, 899, 1293, 903, 1311, 3100, 465, 1311, 1319, 813 },
{ 394, 927, 942, 1103, 1358, 1104, 946, 593, 1363, 1109 },
{ 559, 1005, 1007, 1016, 658, 1173, 1021, 1164, 623, 1028 },
{ 564, 796, 632, 1005, 1014, 863, 2316, 1268, 938, 764 } },
{ { 266, 606, 1098, 1228, 1497, 1243, 948, 1030, 1734, 1461 },
{ 366, 585, 901, 1060, 1407, 1247, 876, 1134, 1620, 1054 },
{ 452, 565, 542, 1729, 1479, 1479, 1016, 886, 2938, 1150 },
{ 555, 1088, 1533, 950, 1354, 895, 834, 1019, 1021, 496 },
{ 704, 815, 1193, 971, 973, 640, 1217, 2214, 832, 578 },
{ 672, 1245, 579, 871, 875, 774, 872, 1273, 1027, 949 },
{ 296, 1134, 2050, 1784, 1636, 3425, 442, 1550, 2076, 722 },
{ 342, 982, 1259, 1846, 1848, 1848, 622, 568, 1847, 1052 },
{ 555, 1064, 1304, 828, 746, 1343, 1075, 1329, 1078, 494 },
{ 288, 1167, 1285, 1174, 1639, 1639, 833, 2254, 1304, 509 } },
{ { 342, 719, 767, 1866, 1757, 1270, 1246, 550, 1746, 2151 },
{ 483, 653, 694, 1509, 1459, 1410, 1218, 507, 1914, 1266 },
{ 488, 757, 447, 2979, 1813, 1268, 1654, 539, 1849, 2109 },
{ 522, 1097, 1085, 851, 1365, 1111, 851, 901, 961, 605 },
{ 709, 716, 841, 728, 736, 945, 941, 862, 2845, 1057 },
{ 512, 1323, 500, 1336, 1083, 681, 1342, 717, 1604, 1350 },
{ 452, 1155, 1372, 1900, 1501, 3290, 311, 944, 1919, 922 },
{ 403, 1520, 977, 2132, 1733, 3522, 1076, 276, 3335, 1547 },
{ 559, 1374, 1101, 615, 673, 2462, 974, 795, 984, 984 },
{ 547, 1122, 1062, 812, 1410, 951, 1140, 622, 1268, 651 } },
{ { 165, 982, 1235, 938, 1334, 1366, 1659, 1578, 964, 1612 },
{ 592, 422, 925, 847, 1139, 1112, 1387, 2036, 861, 1041 },
{ 403, 837, 732, 770, 941, 1658, 1250, 809, 1407, 1407 },
{ 896, 874, 1071, 381, 1568, 1722, 1437, 2192, 480, 1035 },
{ 640, 1098, 1012, 1032, 684, 1382, 1581, 2106, 416, 865 },
{ 559, 1005, 819, 914, 710, 770, 1418, 920, 838, 1435 },
{ 415, 1258, 1245, 870, 1278, 3067, 770, 1021, 1287, 522 },
{ 406, 990, 601, 1009, 1265, 1265, 1267, 759, 1017, 1277 },
{ 968, 1182, 1329, 788, 1032, 1292, 1705, 1714, 203, 1403 },
{ 732, 877, 1279, 471, 901, 1161, 1545, 1294, 755, 755 } },
{ { 111, 931, 1378, 1185, 1933, 1648, 1148, 1714, 1873, 1307 },
{ 406, 414, 1030, 1023, 1910, 1404, 1313, 1647, 1509, 793 },
{ 342, 640, 575, 1088, 1241, 1349, 1161, 1350, 1756, 1502 },
{ 559, 766, 1185, 357, 1682, 1428, 1329, 1897, 1219, 802 },
{ 473, 909, 1164, 771, 719, 2508, 1427, 1432, 722, 782 },
{ 342, 892, 785, 1145, 1150, 794, 1296, 1550, 973, 1057 },
{ 208, 1036, 1326, 1343, 1606, 3395, 815, 1455, 1618, 712 },
{ 228, 928, 890, 1046, 3499, 1711, 994, 829, 1720, 1318 },
{ 768, 724, 1058, 636, 991, 1075, 1319, 1324, 616, 825 },
{ 305, 1167, 1358, 899, 1587, 1587, 987, 1988, 1332, 501 } }
};
//------------------------------------------------------------------------------
// helper functions for residuals struct VP8Residual.
void VP8InitResidual(int first, int coeff_type,
VP8Encoder* const enc, VP8Residual* const res) {
res->coeff_type = coeff_type;
res->prob = enc->proba_.coeffs_[coeff_type];
res->stats = enc->proba_.stats_[coeff_type];
res->costs = enc->proba_.remapped_costs_[coeff_type];
res->first = first;
}
//------------------------------------------------------------------------------
// Mode costs
int VP8GetCostLuma4(VP8EncIterator* const it, const int16_t levels[16]) {
const int x = (it->i4_ & 3), y = (it->i4_ >> 2);
VP8Residual res;
VP8Encoder* const enc = it->enc_;
int R = 0;
int ctx;
VP8InitResidual(0, 3, enc, &res);
ctx = it->top_nz_[x] + it->left_nz_[y];
VP8SetResidualCoeffs(levels, &res);
R += VP8GetResidualCost(ctx, &res);
return R;
}
int VP8GetCostLuma16(VP8EncIterator* const it, const VP8ModeScore* const rd) {
VP8Residual res;
VP8Encoder* const enc = it->enc_;
int x, y;
int R = 0;
VP8IteratorNzToBytes(it); // re-import the non-zero context
// DC
VP8InitResidual(0, 1, enc, &res);
VP8SetResidualCoeffs(rd->y_dc_levels, &res);
R += VP8GetResidualCost(it->top_nz_[8] + it->left_nz_[8], &res);
// AC
VP8InitResidual(1, 0, enc, &res);
for (y = 0; y < 4; ++y) {
for (x = 0; x < 4; ++x) {
const int ctx = it->top_nz_[x] + it->left_nz_[y];
VP8SetResidualCoeffs(rd->y_ac_levels[x + y * 4], &res);
R += VP8GetResidualCost(ctx, &res);
it->top_nz_[x] = it->left_nz_[y] = (res.last >= 0);
}
}
return R;
}
int VP8GetCostUV(VP8EncIterator* const it, const VP8ModeScore* const rd) {
VP8Residual res;
VP8Encoder* const enc = it->enc_;
int ch, x, y;
int R = 0;
VP8IteratorNzToBytes(it); // re-import the non-zero context
VP8InitResidual(0, 2, enc, &res);
for (ch = 0; ch <= 2; ch += 2) {
for (y = 0; y < 2; ++y) {
for (x = 0; x < 2; ++x) {
const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
VP8SetResidualCoeffs(rd->uv_levels[ch * 2 + x + y * 2], &res);
R += VP8GetResidualCost(ctx, &res);
it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = (res.last >= 0);
}
}
}
return R;
}
//------------------------------------------------------------------------------
// Recording of token probabilities.
// We keep the table-free variant around for reference, in case it's needed.
#define USE_LEVEL_CODE_TABLE
// Simulate block coding, but only record statistics.
// Note: no need to record the fixed probas.
int VP8RecordCoeffs(int ctx, const VP8Residual* const res) {
int n = res->first;
// should be stats[VP8EncBands[n]], but it's equivalent for n=0 or 1
proba_t* s = res->stats[n][ctx];
if (res->last < 0) {
VP8RecordStats(0, s + 0);
return 0;
}
while (n <= res->last) {
int v;
VP8RecordStats(1, s + 0); // order of record doesn't matter
while ((v = res->coeffs[n++]) == 0) {
VP8RecordStats(0, s + 1);
s = res->stats[VP8EncBands[n]][0];
}
VP8RecordStats(1, s + 1);
if (!VP8RecordStats(2u < (unsigned int)(v + 1), s + 2)) { // v = -1 or 1
s = res->stats[VP8EncBands[n]][1];
} else {
v = abs(v);
#if !defined(USE_LEVEL_CODE_TABLE)
if (!VP8RecordStats(v > 4, s + 3)) {
if (VP8RecordStats(v != 2, s + 4))
VP8RecordStats(v == 4, s + 5);
} else if (!VP8RecordStats(v > 10, s + 6)) {
VP8RecordStats(v > 6, s + 7);
} else if (!VP8RecordStats((v >= 3 + (8 << 2)), s + 8)) {
VP8RecordStats((v >= 3 + (8 << 1)), s + 9);
} else {
VP8RecordStats((v >= 3 + (8 << 3)), s + 10);
}
#else
if (v > MAX_VARIABLE_LEVEL) {
v = MAX_VARIABLE_LEVEL;
}
{
const int bits = VP8LevelCodes[v - 1][1];
int pattern = VP8LevelCodes[v - 1][0];
int i;
for (i = 0; (pattern >>= 1) != 0; ++i) {
const int mask = 2 << i;
if (pattern & 1) VP8RecordStats(!!(bits & mask), s + 3 + i);
}
}
#endif
s = res->stats[VP8EncBands[n]][2];
}
}
if (n < 16) VP8RecordStats(0, s + 0);
return 1;
}
//------------------------------------------------------------------------------


@@ -0,0 +1,235 @@
// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Selecting filter level
//
// Author: somnath@google.com (Somnath Banerjee)
#include <assert.h>
#include "src/enc/vp8i_enc.h"
#include "src/dsp/dsp.h"
// This table gives, for a given sharpness, the filtering strength to be
// used (at least) in order to filter a given edge step delta.
// This is constructed by brute force inspection: for all delta, we iterate
// over all possible filtering strength / thresh until needs_filter() returns
// true.
#define MAX_DELTA_SIZE 64
static const uint8_t kLevelsFromDelta[8][MAX_DELTA_SIZE] = {
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 },
{ 0, 1, 2, 3, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 17, 18,
20, 21, 23, 24, 26, 27, 29, 30, 32, 33, 35, 36, 38, 39, 41, 42,
44, 45, 47, 48, 50, 51, 53, 54, 56, 57, 59, 60, 62, 63, 63, 63,
63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
{ 0, 1, 2, 3, 5, 6, 7, 8, 9, 11, 12, 13, 14, 16, 17, 19,
20, 22, 23, 25, 26, 28, 29, 31, 32, 34, 35, 37, 38, 40, 41, 43,
44, 46, 47, 49, 50, 52, 53, 55, 56, 58, 59, 61, 62, 63, 63, 63,
63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
{ 0, 1, 2, 3, 5, 6, 7, 8, 9, 11, 12, 13, 15, 16, 18, 19,
21, 22, 24, 25, 27, 28, 30, 31, 33, 34, 36, 37, 39, 40, 42, 43,
45, 46, 48, 49, 51, 52, 54, 55, 57, 58, 60, 61, 63, 63, 63, 63,
63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
{ 0, 1, 2, 3, 5, 6, 7, 8, 9, 11, 12, 14, 15, 17, 18, 20,
21, 23, 24, 26, 27, 29, 30, 32, 33, 35, 36, 38, 39, 41, 42, 44,
45, 47, 48, 50, 51, 53, 54, 56, 57, 59, 60, 62, 63, 63, 63, 63,
63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
{ 0, 1, 2, 4, 5, 7, 8, 9, 11, 12, 13, 15, 16, 17, 19, 20,
22, 23, 25, 26, 28, 29, 31, 32, 34, 35, 37, 38, 40, 41, 43, 44,
46, 47, 49, 50, 52, 53, 55, 56, 58, 59, 61, 62, 63, 63, 63, 63,
63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
{ 0, 1, 2, 4, 5, 7, 8, 9, 11, 12, 13, 15, 16, 18, 19, 21,
22, 24, 25, 27, 28, 30, 31, 33, 34, 36, 37, 39, 40, 42, 43, 45,
46, 48, 49, 51, 52, 54, 55, 57, 58, 60, 61, 63, 63, 63, 63, 63,
63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
{ 0, 1, 2, 4, 5, 7, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21,
23, 24, 26, 27, 29, 30, 32, 33, 35, 36, 38, 39, 41, 42, 44, 45,
47, 48, 50, 51, 53, 54, 56, 57, 59, 60, 62, 63, 63, 63, 63, 63,
63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 }
};
int VP8FilterStrengthFromDelta(int sharpness, int delta) {
const int pos = (delta < MAX_DELTA_SIZE) ? delta : MAX_DELTA_SIZE - 1;
assert(sharpness >= 0 && sharpness <= 7);
return kLevelsFromDelta[sharpness][pos];
}
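// E.g. sharpness 0 maps the strength one-to-one onto delta
// (VP8FilterStrengthFromDelta(0, 10) == 10), while sharpness 7 filters more
// aggressively (VP8FilterStrengthFromDelta(7, 10) == 14).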
//------------------------------------------------------------------------------
// Paragraph 15.4: compute the inner-edge filtering strength
#if !defined(WEBP_REDUCE_SIZE)
static int GetILevel(int sharpness, int level) {
if (sharpness > 0) {
if (sharpness > 4) {
level >>= 2;
} else {
level >>= 1;
}
if (level > 9 - sharpness) {
level = 9 - sharpness;
}
}
if (level < 1) level = 1;
return level;
}
static void DoFilter(const VP8EncIterator* const it, int level) {
const VP8Encoder* const enc = it->enc_;
const int ilevel = GetILevel(enc->config_->filter_sharpness, level);
const int limit = 2 * level + ilevel;
uint8_t* const y_dst = it->yuv_out2_ + Y_OFF_ENC;
uint8_t* const u_dst = it->yuv_out2_ + U_OFF_ENC;
uint8_t* const v_dst = it->yuv_out2_ + V_OFF_ENC;
// copy current block to yuv_out2_
memcpy(y_dst, it->yuv_out_, YUV_SIZE_ENC * sizeof(uint8_t));
if (enc->filter_hdr_.simple_ == 1) { // simple
VP8SimpleHFilter16i(y_dst, BPS, limit);
VP8SimpleVFilter16i(y_dst, BPS, limit);
} else { // complex
const int hev_thresh = (level >= 40) ? 2 : (level >= 15) ? 1 : 0;
VP8HFilter16i(y_dst, BPS, limit, ilevel, hev_thresh);
VP8HFilter8i(u_dst, v_dst, BPS, limit, ilevel, hev_thresh);
VP8VFilter16i(y_dst, BPS, limit, ilevel, hev_thresh);
VP8VFilter8i(u_dst, v_dst, BPS, limit, ilevel, hev_thresh);
}
}
//------------------------------------------------------------------------------
// SSIM metric for one macroblock
static double GetMBSSIM(const uint8_t* yuv1, const uint8_t* yuv2) {
int x, y;
double sum = 0.;
// compute SSIM in a 10 x 10 window
for (y = VP8_SSIM_KERNEL; y < 16 - VP8_SSIM_KERNEL; y++) {
for (x = VP8_SSIM_KERNEL; x < 16 - VP8_SSIM_KERNEL; x++) {
sum += VP8SSIMGetClipped(yuv1 + Y_OFF_ENC, BPS, yuv2 + Y_OFF_ENC, BPS,
x, y, 16, 16);
}
}
for (x = 1; x < 7; x++) {
for (y = 1; y < 7; y++) {
sum += VP8SSIMGetClipped(yuv1 + U_OFF_ENC, BPS, yuv2 + U_OFF_ENC, BPS,
x, y, 8, 8);
sum += VP8SSIMGetClipped(yuv1 + V_OFF_ENC, BPS, yuv2 + V_OFF_ENC, BPS,
x, y, 8, 8);
}
}
return sum;
}
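// Note: with VP8_SSIM_KERNEL == 3 (from dsp.h), the luma loops above cover
// x, y in [3, 13), i.e. the 10 x 10 window mentioned above, while the chroma
// loops cover the 6 x 6 interior of each 8 x 8 plane.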
#endif // !defined(WEBP_REDUCE_SIZE)
//------------------------------------------------------------------------------
// Exposed APIs: the encoder should call the following three functions to
// adjust loop filter strength
void VP8InitFilter(VP8EncIterator* const it) {
#if !defined(WEBP_REDUCE_SIZE)
if (it->lf_stats_ != NULL) {
int s, i;
for (s = 0; s < NUM_MB_SEGMENTS; s++) {
for (i = 0; i < MAX_LF_LEVELS; i++) {
(*it->lf_stats_)[s][i] = 0;
}
}
VP8SSIMDspInit();
}
#else
(void)it;
#endif
}
void VP8StoreFilterStats(VP8EncIterator* const it) {
#if !defined(WEBP_REDUCE_SIZE)
int d;
VP8Encoder* const enc = it->enc_;
const int s = it->mb_->segment_;
const int level0 = enc->dqm_[s].fstrength_;
// explore +/-quant range of values around level0
const int delta_min = -enc->dqm_[s].quant_;
const int delta_max = enc->dqm_[s].quant_;
const int step_size = (delta_max - delta_min >= 4) ? 4 : 1;
if (it->lf_stats_ == NULL) return;
// NOTE: Currently we apply the filter only across the subblock edges.
// There are two reasons for that:
// 1. Applying the filter on macroblock edges would change the pixels in
// the left and top macroblocks, which would be hard to restore.
// 2. Macroblocks on the bottom and right are not yet compressed, so we
// cannot apply the filter on the right and bottom macroblock edges.
if (it->mb_->type_ == 1 && it->mb_->skip_) return;
// Always try filter level zero
(*it->lf_stats_)[s][0] += GetMBSSIM(it->yuv_in_, it->yuv_out_);
for (d = delta_min; d <= delta_max; d += step_size) {
const int level = level0 + d;
if (level <= 0 || level >= MAX_LF_LEVELS) {
continue;
}
DoFilter(it, level);
(*it->lf_stats_)[s][level] += GetMBSSIM(it->yuv_in_, it->yuv_out2_);
}
#else // defined(WEBP_REDUCE_SIZE)
(void)it;
#endif // !defined(WEBP_REDUCE_SIZE)
}
void VP8AdjustFilterStrength(VP8EncIterator* const it) {
VP8Encoder* const enc = it->enc_;
#if !defined(WEBP_REDUCE_SIZE)
if (it->lf_stats_ != NULL) {
int s;
for (s = 0; s < NUM_MB_SEGMENTS; s++) {
int i, best_level = 0;
// Improvement over filter level 0 should be at least 1e-5 (relatively)
double best_v = 1.00001 * (*it->lf_stats_)[s][0];
for (i = 1; i < MAX_LF_LEVELS; i++) {
const double v = (*it->lf_stats_)[s][i];
if (v > best_v) {
best_v = v;
best_level = i;
}
}
enc->dqm_[s].fstrength_ = best_level;
}
return;
}
#endif // !defined(WEBP_REDUCE_SIZE)
if (enc->config_->filter_strength > 0) {
int max_level = 0;
int s;
for (s = 0; s < NUM_MB_SEGMENTS; s++) {
VP8SegmentInfo* const dqm = &enc->dqm_[s];
// this '>> 3' accounts for some inverse WHT scaling
const int delta = (dqm->max_edge_ * dqm->y2_.q_[1]) >> 3;
const int level =
VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, delta);
if (level > dqm->fstrength_) {
dqm->fstrength_ = level;
}
if (max_level < dqm->fstrength_) {
max_level = dqm->fstrength_;
}
}
enc->filter_hdr_.level_ = max_level;
}
}
// -----------------------------------------------------------------------------


@@ -0,0 +1,894 @@
// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// frame coding and analysis
//
// Author: Skal (pascal.massimino@gmail.com)
#include <string.h>
#include <math.h>
#include "src/enc/cost_enc.h"
#include "src/enc/vp8i_enc.h"
#include "src/dsp/dsp.h"
#include "src/webp/format_constants.h" // RIFF constants
#define SEGMENT_VISU 0
#define DEBUG_SEARCH 0 // useful to track search convergence
//------------------------------------------------------------------------------
// multi-pass convergence
#define HEADER_SIZE_ESTIMATE (RIFF_HEADER_SIZE + CHUNK_HEADER_SIZE + \
VP8_FRAME_HEADER_SIZE)
#define DQ_LIMIT 0.4 // convergence is considered reached if dq < DQ_LIMIT
// we allow 2k of extra head-room in PARTITION0 limit.
#define PARTITION0_SIZE_LIMIT ((VP8_MAX_PARTITION0_SIZE - 2048ULL) << 11)
static float Clamp(float v, float min, float max) {
return (v < min) ? min : (v > max) ? max : v;
}
typedef struct { // struct for organizing convergence in either size or PSNR
int is_first;
float dq;
float q, last_q;
float qmin, qmax;
double value, last_value; // PSNR or size
double target;
int do_size_search;
} PassStats;
static int InitPassStats(const VP8Encoder* const enc, PassStats* const s) {
const uint64_t target_size = (uint64_t)enc->config_->target_size;
const int do_size_search = (target_size != 0);
const float target_PSNR = enc->config_->target_PSNR;
s->is_first = 1;
s->dq = 10.f;
s->qmin = 1.f * enc->config_->qmin;
s->qmax = 1.f * enc->config_->qmax;
s->q = s->last_q = Clamp(enc->config_->quality, s->qmin, s->qmax);
s->target = do_size_search ? (double)target_size
: (target_PSNR > 0.) ? target_PSNR
: 40.; // default, just in case
s->value = s->last_value = 0.;
s->do_size_search = do_size_search;
return do_size_search;
}
static float ComputeNextQ(PassStats* const s) {
float dq;
if (s->is_first) {
dq = (s->value > s->target) ? -s->dq : s->dq;
s->is_first = 0;
} else if (s->value != s->last_value) {
const double slope = (s->target - s->value) / (s->last_value - s->value);
dq = (float)(slope * (s->last_q - s->q));
} else {
dq = 0.; // we're done?!
}
// Limit variable to avoid large swings.
s->dq = Clamp(dq, -30.f, 30.f);
s->last_q = s->q;
s->last_value = s->value;
s->q = Clamp(s->q + s->dq, s->qmin, s->qmax);
return s->q;
}
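// Worked example (illustrative numbers only): with target = 100000 bytes, a
// first pass at q = 75 measuring 130000 bytes overshoots, so q drops by the
// initial dq = 10 to 65. If the second pass measures 90000 bytes, the secant
// step gives slope = (100000 - 90000) / (130000 - 90000) = 0.25 and
// dq = 0.25 * (75 - 65) = 2.5, i.e. the next probe q = 67.5 is the linear
// interpolation of the two measured (q, size) points at the target size.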
//------------------------------------------------------------------------------
// Tables for level coding
const uint8_t VP8Cat3[] = { 173, 148, 140 };
const uint8_t VP8Cat4[] = { 176, 155, 140, 135 };
const uint8_t VP8Cat5[] = { 180, 157, 141, 134, 130 };
const uint8_t VP8Cat6[] =
{ 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129 };
//------------------------------------------------------------------------------
// Reset the statistics (number of skips, token probabilities, level costs, ...)
static void ResetStats(VP8Encoder* const enc) {
VP8EncProba* const proba = &enc->proba_;
VP8CalculateLevelCosts(proba);
proba->nb_skip_ = 0;
}
//------------------------------------------------------------------------------
// Skip decision probability
#define SKIP_PROBA_THRESHOLD 250 // value below which using skip_proba is OK.
static int CalcSkipProba(uint64_t nb, uint64_t total) {
return (int)(total ? (total - nb) * 255 / total : 255);
}
// Returns the bit-cost for coding the skip probability.
static int FinalizeSkipProba(VP8Encoder* const enc) {
VP8EncProba* const proba = &enc->proba_;
const int nb_mbs = enc->mb_w_ * enc->mb_h_;
const int nb_events = proba->nb_skip_;
int size;
proba->skip_proba_ = CalcSkipProba(nb_events, nb_mbs);
proba->use_skip_proba_ = (proba->skip_proba_ < SKIP_PROBA_THRESHOLD);
size = 256; // 'use_skip_proba' bit
if (proba->use_skip_proba_) {
size += nb_events * VP8BitCost(1, proba->skip_proba_)
+ (nb_mbs - nb_events) * VP8BitCost(0, proba->skip_proba_);
size += 8 * 256; // cost of signaling the skip_proba_ itself.
}
return size;
}
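// Example (illustrative): with 1000 macroblocks of which 300 are skipped,
// skip_proba_ = (1000 - 300) * 255 / 1000 = 178; since 178 < 250 the flag is
// used, and the total adds VP8BitCost(1, 178) per skipped MB,
// VP8BitCost(0, 178) per coded MB, and 8 * 256 units (one byte) to transmit
// the probability value itself.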
// Collect statistics and deduce probabilities for next coding pass.
// Return the total bit-cost for coding the probability updates.
static int CalcTokenProba(int nb, int total) {
assert(nb <= total);
return nb ? (255 - nb * 255 / total) : 255;
}
// Cost of coding 'nb' 1's and 'total-nb' 0's using 'proba' probability.
static int BranchCost(int nb, int total, int proba) {
return nb * VP8BitCost(1, proba) + (total - nb) * VP8BitCost(0, proba);
}
static void ResetTokenStats(VP8Encoder* const enc) {
VP8EncProba* const proba = &enc->proba_;
memset(proba->stats_, 0, sizeof(proba->stats_));
}
static int FinalizeTokenProbas(VP8EncProba* const proba) {
int has_changed = 0;
int size = 0;
int t, b, c, p;
for (t = 0; t < NUM_TYPES; ++t) {
for (b = 0; b < NUM_BANDS; ++b) {
for (c = 0; c < NUM_CTX; ++c) {
for (p = 0; p < NUM_PROBAS; ++p) {
const proba_t stats = proba->stats_[t][b][c][p];
const int nb = (stats >> 0) & 0xffff;
const int total = (stats >> 16) & 0xffff;
const int update_proba = VP8CoeffsUpdateProba[t][b][c][p];
const int old_p = VP8CoeffsProba0[t][b][c][p];
const int new_p = CalcTokenProba(nb, total);
const int old_cost = BranchCost(nb, total, old_p)
+ VP8BitCost(0, update_proba);
const int new_cost = BranchCost(nb, total, new_p)
+ VP8BitCost(1, update_proba)
+ 8 * 256;
const int use_new_p = (old_cost > new_cost);
size += VP8BitCost(use_new_p, update_proba);
if (use_new_p) { // only use proba that seem meaningful enough.
proba->coeffs_[t][b][c][p] = new_p;
has_changed |= (new_p != old_p);
size += 8 * 256;
} else {
proba->coeffs_[t][b][c][p] = old_p;
}
}
}
}
}
proba->dirty_ = has_changed;
return size;
}
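// Note: a refreshed probability is adopted only when its coding gain on the
// observed 0/1 counts outweighs the flat 8 * 256 units (one byte) needed to
// transmit the new value, the 'update' flag cost being counted on both sides.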
//------------------------------------------------------------------------------
// Finalize Segment probability based on the coding tree
static int GetProba(int a, int b) {
const int total = a + b;
return (total == 0) ? 255 // that's the default probability.
: (255 * a + total / 2) / total; // rounded proba
}
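// e.g. GetProba(1, 3) = (255 * 1 + 2) / 4 = 64: a zero-branch observed once
// out of four occurrences gets probability 64 (~255 / 4).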
static void ResetSegments(VP8Encoder* const enc) {
int n;
for (n = 0; n < enc->mb_w_ * enc->mb_h_; ++n) {
enc->mb_info_[n].segment_ = 0;
}
}
static void SetSegmentProbas(VP8Encoder* const enc) {
int p[NUM_MB_SEGMENTS] = { 0 };
int n;
for (n = 0; n < enc->mb_w_ * enc->mb_h_; ++n) {
const VP8MBInfo* const mb = &enc->mb_info_[n];
++p[mb->segment_];
}
#if !defined(WEBP_DISABLE_STATS)
if (enc->pic_->stats != NULL) {
for (n = 0; n < NUM_MB_SEGMENTS; ++n) {
enc->pic_->stats->segment_size[n] = p[n];
}
}
#endif
if (enc->segment_hdr_.num_segments_ > 1) {
uint8_t* const probas = enc->proba_.segments_;
probas[0] = GetProba(p[0] + p[1], p[2] + p[3]);
probas[1] = GetProba(p[0], p[1]);
probas[2] = GetProba(p[2], p[3]);
enc->segment_hdr_.update_map_ =
(probas[0] != 255) || (probas[1] != 255) || (probas[2] != 255);
if (!enc->segment_hdr_.update_map_) ResetSegments(enc);
enc->segment_hdr_.size_ =
p[0] * (VP8BitCost(0, probas[0]) + VP8BitCost(0, probas[1])) +
p[1] * (VP8BitCost(0, probas[0]) + VP8BitCost(1, probas[1])) +
p[2] * (VP8BitCost(1, probas[0]) + VP8BitCost(0, probas[2])) +
p[3] * (VP8BitCost(1, probas[0]) + VP8BitCost(1, probas[2]));
} else {
enc->segment_hdr_.update_map_ = 0;
enc->segment_hdr_.size_ = 0;
}
}
//------------------------------------------------------------------------------
// Coefficient coding
static int PutCoeffs(VP8BitWriter* const bw, int ctx, const VP8Residual* res) {
int n = res->first;
// should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
const uint8_t* p = res->prob[n][ctx];
if (!VP8PutBit(bw, res->last >= 0, p[0])) {
return 0;
}
while (n < 16) {
const int c = res->coeffs[n++];
const int sign = c < 0;
int v = sign ? -c : c;
if (!VP8PutBit(bw, v != 0, p[1])) {
p = res->prob[VP8EncBands[n]][0];
continue;
}
if (!VP8PutBit(bw, v > 1, p[2])) {
p = res->prob[VP8EncBands[n]][1];
} else {
if (!VP8PutBit(bw, v > 4, p[3])) {
if (VP8PutBit(bw, v != 2, p[4])) {
VP8PutBit(bw, v == 4, p[5]);
}
} else if (!VP8PutBit(bw, v > 10, p[6])) {
if (!VP8PutBit(bw, v > 6, p[7])) {
VP8PutBit(bw, v == 6, 159);
} else {
VP8PutBit(bw, v >= 9, 165);
VP8PutBit(bw, !(v & 1), 145);
}
} else {
int mask;
const uint8_t* tab;
if (v < 3 + (8 << 1)) { // VP8Cat3 (3b)
VP8PutBit(bw, 0, p[8]);
VP8PutBit(bw, 0, p[9]);
v -= 3 + (8 << 0);
mask = 1 << 2;
tab = VP8Cat3;
} else if (v < 3 + (8 << 2)) { // VP8Cat4 (4b)
VP8PutBit(bw, 0, p[8]);
VP8PutBit(bw, 1, p[9]);
v -= 3 + (8 << 1);
mask = 1 << 3;
tab = VP8Cat4;
} else if (v < 3 + (8 << 3)) { // VP8Cat5 (5b)
VP8PutBit(bw, 1, p[8]);
VP8PutBit(bw, 0, p[10]);
v -= 3 + (8 << 2);
mask = 1 << 4;
tab = VP8Cat5;
} else { // VP8Cat6 (11b)
VP8PutBit(bw, 1, p[8]);
VP8PutBit(bw, 1, p[10]);
v -= 3 + (8 << 3);
mask = 1 << 10;
tab = VP8Cat6;
}
while (mask) {
VP8PutBit(bw, !!(v & mask), *tab++);
mask >>= 1;
}
}
p = res->prob[VP8EncBands[n]][2];
}
VP8PutBitUniform(bw, sign);
if (n == 16 || !VP8PutBit(bw, n <= res->last, p[0])) {
return 1; // EOB
}
}
return 1;
}
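// Coding example (illustrative): a level v = 12 takes the v != 0, v > 1,
// v > 4 and v > 10 branches, then falls in the VP8Cat3 range since
// 12 < 3 + (8 << 1). The residual 12 - 11 = 1 is then written MSB-first as
// three bits (mask starts at 1 << 2) with the fixed probabilities
// { 173, 148, 140 } of VP8Cat3.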
static void CodeResiduals(VP8BitWriter* const bw, VP8EncIterator* const it,
const VP8ModeScore* const rd) {
int x, y, ch;
VP8Residual res;
uint64_t pos1, pos2, pos3;
const int i16 = (it->mb_->type_ == 1);
const int segment = it->mb_->segment_;
VP8Encoder* const enc = it->enc_;
VP8IteratorNzToBytes(it);
pos1 = VP8BitWriterPos(bw);
if (i16) {
VP8InitResidual(0, 1, enc, &res);
VP8SetResidualCoeffs(rd->y_dc_levels, &res);
it->top_nz_[8] = it->left_nz_[8] =
PutCoeffs(bw, it->top_nz_[8] + it->left_nz_[8], &res);
VP8InitResidual(1, 0, enc, &res);
} else {
VP8InitResidual(0, 3, enc, &res);
}
// luma-AC
for (y = 0; y < 4; ++y) {
for (x = 0; x < 4; ++x) {
const int ctx = it->top_nz_[x] + it->left_nz_[y];
VP8SetResidualCoeffs(rd->y_ac_levels[x + y * 4], &res);
it->top_nz_[x] = it->left_nz_[y] = PutCoeffs(bw, ctx, &res);
}
}
pos2 = VP8BitWriterPos(bw);
// U/V
VP8InitResidual(0, 2, enc, &res);
for (ch = 0; ch <= 2; ch += 2) {
for (y = 0; y < 2; ++y) {
for (x = 0; x < 2; ++x) {
const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
VP8SetResidualCoeffs(rd->uv_levels[ch * 2 + x + y * 2], &res);
it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] =
PutCoeffs(bw, ctx, &res);
}
}
}
pos3 = VP8BitWriterPos(bw);
it->luma_bits_ = pos2 - pos1;
it->uv_bits_ = pos3 - pos2;
it->bit_count_[segment][i16] += it->luma_bits_;
it->bit_count_[segment][2] += it->uv_bits_;
VP8IteratorBytesToNz(it);
}
// Same as CodeResiduals, but doesn't actually write anything.
// Instead, it just records the event distribution.
static void RecordResiduals(VP8EncIterator* const it,
const VP8ModeScore* const rd) {
int x, y, ch;
VP8Residual res;
VP8Encoder* const enc = it->enc_;
VP8IteratorNzToBytes(it);
if (it->mb_->type_ == 1) { // i16x16
VP8InitResidual(0, 1, enc, &res);
VP8SetResidualCoeffs(rd->y_dc_levels, &res);
it->top_nz_[8] = it->left_nz_[8] =
VP8RecordCoeffs(it->top_nz_[8] + it->left_nz_[8], &res);
VP8InitResidual(1, 0, enc, &res);
} else {
VP8InitResidual(0, 3, enc, &res);
}
// luma-AC
for (y = 0; y < 4; ++y) {
for (x = 0; x < 4; ++x) {
const int ctx = it->top_nz_[x] + it->left_nz_[y];
VP8SetResidualCoeffs(rd->y_ac_levels[x + y * 4], &res);
it->top_nz_[x] = it->left_nz_[y] = VP8RecordCoeffs(ctx, &res);
}
}
// U/V
VP8InitResidual(0, 2, enc, &res);
for (ch = 0; ch <= 2; ch += 2) {
for (y = 0; y < 2; ++y) {
for (x = 0; x < 2; ++x) {
const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
VP8SetResidualCoeffs(rd->uv_levels[ch * 2 + x + y * 2], &res);
it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] =
VP8RecordCoeffs(ctx, &res);
}
}
}
VP8IteratorBytesToNz(it);
}
//------------------------------------------------------------------------------
// Token buffer
#if !defined(DISABLE_TOKEN_BUFFER)
static int RecordTokens(VP8EncIterator* const it, const VP8ModeScore* const rd,
VP8TBuffer* const tokens) {
int x, y, ch;
VP8Residual res;
VP8Encoder* const enc = it->enc_;
VP8IteratorNzToBytes(it);
if (it->mb_->type_ == 1) { // i16x16
const int ctx = it->top_nz_[8] + it->left_nz_[8];
VP8InitResidual(0, 1, enc, &res);
VP8SetResidualCoeffs(rd->y_dc_levels, &res);
it->top_nz_[8] = it->left_nz_[8] =
VP8RecordCoeffTokens(ctx, &res, tokens);
VP8InitResidual(1, 0, enc, &res);
} else {
VP8InitResidual(0, 3, enc, &res);
}
// luma-AC
for (y = 0; y < 4; ++y) {
for (x = 0; x < 4; ++x) {
const int ctx = it->top_nz_[x] + it->left_nz_[y];
VP8SetResidualCoeffs(rd->y_ac_levels[x + y * 4], &res);
it->top_nz_[x] = it->left_nz_[y] =
VP8RecordCoeffTokens(ctx, &res, tokens);
}
}
// U/V
VP8InitResidual(0, 2, enc, &res);
for (ch = 0; ch <= 2; ch += 2) {
for (y = 0; y < 2; ++y) {
for (x = 0; x < 2; ++x) {
const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
VP8SetResidualCoeffs(rd->uv_levels[ch * 2 + x + y * 2], &res);
it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] =
VP8RecordCoeffTokens(ctx, &res, tokens);
}
}
}
VP8IteratorBytesToNz(it);
return !tokens->error_;
}
#endif // !DISABLE_TOKEN_BUFFER
//------------------------------------------------------------------------------
// ExtraInfo map / Debug function
#if !defined(WEBP_DISABLE_STATS)
#if SEGMENT_VISU
static void SetBlock(uint8_t* p, int value, int size) {
int y;
for (y = 0; y < size; ++y) {
memset(p, value, size);
p += BPS;
}
}
#endif
static void ResetSSE(VP8Encoder* const enc) {
enc->sse_[0] = 0;
enc->sse_[1] = 0;
enc->sse_[2] = 0;
// Note: enc->sse_[3] is managed by alpha.c
enc->sse_count_ = 0;
}
static void StoreSSE(const VP8EncIterator* const it) {
VP8Encoder* const enc = it->enc_;
const uint8_t* const in = it->yuv_in_;
const uint8_t* const out = it->yuv_out_;
// Note: not totally accurate at the boundary, and doesn't include the
// in-loop filter.
enc->sse_[0] += VP8SSE16x16(in + Y_OFF_ENC, out + Y_OFF_ENC);
enc->sse_[1] += VP8SSE8x8(in + U_OFF_ENC, out + U_OFF_ENC);
enc->sse_[2] += VP8SSE8x8(in + V_OFF_ENC, out + V_OFF_ENC);
enc->sse_count_ += 16 * 16;
}
static void StoreSideInfo(const VP8EncIterator* const it) {
VP8Encoder* const enc = it->enc_;
const VP8MBInfo* const mb = it->mb_;
WebPPicture* const pic = enc->pic_;
if (pic->stats != NULL) {
StoreSSE(it);
enc->block_count_[0] += (mb->type_ == 0);
enc->block_count_[1] += (mb->type_ == 1);
enc->block_count_[2] += (mb->skip_ != 0);
}
if (pic->extra_info != NULL) {
uint8_t* const info = &pic->extra_info[it->x_ + it->y_ * enc->mb_w_];
switch (pic->extra_info_type) {
case 1: *info = mb->type_; break;
case 2: *info = mb->segment_; break;
case 3: *info = enc->dqm_[mb->segment_].quant_; break;
case 4: *info = (mb->type_ == 1) ? it->preds_[0] : 0xff; break;
case 5: *info = mb->uv_mode_; break;
case 6: {
const int b = (int)((it->luma_bits_ + it->uv_bits_ + 7) >> 3);
*info = (b > 255) ? 255 : b; break;
}
case 7: *info = mb->alpha_; break;
default: *info = 0; break;
}
}
#if SEGMENT_VISU // visualize segments and prediction modes
SetBlock(it->yuv_out_ + Y_OFF_ENC, mb->segment_ * 64, 16);
SetBlock(it->yuv_out_ + U_OFF_ENC, it->preds_[0] * 64, 8);
SetBlock(it->yuv_out_ + V_OFF_ENC, mb->uv_mode_ * 64, 8);
#endif
}
static void ResetSideInfo(const VP8EncIterator* const it) {
VP8Encoder* const enc = it->enc_;
WebPPicture* const pic = enc->pic_;
if (pic->stats != NULL) {
memset(enc->block_count_, 0, sizeof(enc->block_count_));
}
ResetSSE(enc);
}
#else // defined(WEBP_DISABLE_STATS)
static void ResetSSE(VP8Encoder* const enc) {
(void)enc;
}
static void StoreSideInfo(const VP8EncIterator* const it) {
VP8Encoder* const enc = it->enc_;
WebPPicture* const pic = enc->pic_;
if (pic->extra_info != NULL) {
if (it->x_ == 0 && it->y_ == 0) { // only do it once, at start
memset(pic->extra_info, 0,
enc->mb_w_ * enc->mb_h_ * sizeof(*pic->extra_info));
}
}
}
static void ResetSideInfo(const VP8EncIterator* const it) {
(void)it;
}
#endif // !defined(WEBP_DISABLE_STATS)
static double GetPSNR(uint64_t mse, uint64_t size) {
return (mse > 0 && size > 0) ? 10. * log10(255. * 255. * size / mse) : 99;
}
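// e.g. an average squared error of 42.5 per pixel gives
// 10 * log10(255 * 255 / 42.5) ~= 31.8 dB.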
//------------------------------------------------------------------------------
// StatLoop(): only collect statistics (number of skips, token usage, ...).
// This is used for deciding optimal probabilities. It also modifies the
// quantizer value if some target (size, PSNR) was specified.
static void SetLoopParams(VP8Encoder* const enc, float q) {
// Make sure the quality parameter is inside valid bounds
q = Clamp(q, 0.f, 100.f);
VP8SetSegmentParams(enc, q); // setup segment quantizations and filters
SetSegmentProbas(enc); // compute segment probabilities
ResetStats(enc);
ResetSSE(enc);
}
static uint64_t OneStatPass(VP8Encoder* const enc, VP8RDLevel rd_opt,
int nb_mbs, int percent_delta,
PassStats* const s) {
VP8EncIterator it;
uint64_t size = 0;
uint64_t size_p0 = 0;
uint64_t distortion = 0;
const uint64_t pixel_count = nb_mbs * 384;
VP8IteratorInit(enc, &it);
SetLoopParams(enc, s->q);
do {
VP8ModeScore info;
VP8IteratorImport(&it, NULL);
if (VP8Decimate(&it, &info, rd_opt)) {
// Just record the number of skips and act like skip_proba is not used.
++enc->proba_.nb_skip_;
}
RecordResiduals(&it, &info);
size += info.R + info.H;
size_p0 += info.H;
distortion += info.D;
if (percent_delta && !VP8IteratorProgress(&it, percent_delta)) {
return 0;
}
VP8IteratorSaveBoundary(&it);
} while (VP8IteratorNext(&it) && --nb_mbs > 0);
size_p0 += enc->segment_hdr_.size_;
if (s->do_size_search) {
size += FinalizeSkipProba(enc);
size += FinalizeTokenProbas(&enc->proba_);
size = ((size + size_p0 + 1024) >> 11) + HEADER_SIZE_ESTIMATE;
s->value = (double)size;
} else {
s->value = GetPSNR(distortion, pixel_count);
}
return size_p0;
}
static int StatLoop(VP8Encoder* const enc) {
const int method = enc->method_;
const int do_search = enc->do_search_;
const int fast_probe = ((method == 0 || method == 3) && !do_search);
int num_pass_left = enc->config_->pass;
const int task_percent = 20;
const int percent_per_pass =
(task_percent + num_pass_left / 2) / num_pass_left;
const int final_percent = enc->percent_ + task_percent;
const VP8RDLevel rd_opt =
(method >= 3 || do_search) ? RD_OPT_BASIC : RD_OPT_NONE;
int nb_mbs = enc->mb_w_ * enc->mb_h_;
PassStats stats;
InitPassStats(enc, &stats);
ResetTokenStats(enc);
// Fast mode: quick analysis pass over a few mbs. Better than nothing.
if (fast_probe) {
if (method == 3) { // we need more stats for method 3 to be reliable.
nb_mbs = (nb_mbs > 200) ? nb_mbs >> 1 : 100;
} else {
nb_mbs = (nb_mbs > 200) ? nb_mbs >> 2 : 50;
}
}
while (num_pass_left-- > 0) {
const int is_last_pass = (fabs(stats.dq) <= DQ_LIMIT) ||
(num_pass_left == 0) ||
(enc->max_i4_header_bits_ == 0);
const uint64_t size_p0 =
OneStatPass(enc, rd_opt, nb_mbs, percent_per_pass, &stats);
if (size_p0 == 0) return 0;
#if (DEBUG_SEARCH > 0)
printf("#%d value:%.1lf -> %.1lf q:%.2f -> %.2f\n",
num_pass_left, stats.last_value, stats.value, stats.last_q, stats.q);
#endif
if (enc->max_i4_header_bits_ > 0 && size_p0 > PARTITION0_SIZE_LIMIT) {
++num_pass_left;
enc->max_i4_header_bits_ >>= 1; // strengthen header bit limitation...
continue; // ...and start over
}
if (is_last_pass) {
break;
}
// If there is no target size: just do several passes without changing 'q'
if (do_search) {
ComputeNextQ(&stats);
if (fabs(stats.dq) <= DQ_LIMIT) break;
}
}
if (!do_search || !stats.do_size_search) {
// Need to finalize probas now, since it wasn't done during the search.
FinalizeSkipProba(enc);
FinalizeTokenProbas(&enc->proba_);
}
VP8CalculateLevelCosts(&enc->proba_); // finalize costs
return WebPReportProgress(enc->pic_, final_percent, &enc->percent_);
}
//------------------------------------------------------------------------------
// Main loops
//
static const uint8_t kAverageBytesPerMB[8] = { 50, 24, 16, 9, 7, 5, 3, 2 };
static int PreLoopInitialize(VP8Encoder* const enc) {
int p;
int ok = 1;
const int average_bytes_per_MB = kAverageBytesPerMB[enc->base_quant_ >> 4];
const int bytes_per_parts =
enc->mb_w_ * enc->mb_h_ * average_bytes_per_MB / enc->num_parts_;
// Initialize the bit-writers
for (p = 0; ok && p < enc->num_parts_; ++p) {
ok = VP8BitWriterInit(enc->parts_ + p, bytes_per_parts);
}
if (!ok) {
VP8EncFreeBitWriters(enc); // malloc error occurred
WebPEncodingSetError(enc->pic_, VP8_ENC_ERROR_OUT_OF_MEMORY);
}
return ok;
}
static int PostLoopFinalize(VP8EncIterator* const it, int ok) {
VP8Encoder* const enc = it->enc_;
if (ok) { // Finalize the partitions, check for extra errors.
int p;
for (p = 0; p < enc->num_parts_; ++p) {
VP8BitWriterFinish(enc->parts_ + p);
ok &= !enc->parts_[p].error_;
}
}
if (ok) { // All good. Finish up.
#if !defined(WEBP_DISABLE_STATS)
if (enc->pic_->stats != NULL) { // finalize byte counters...
int i, s;
for (i = 0; i <= 2; ++i) {
for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
enc->residual_bytes_[i][s] = (int)((it->bit_count_[s][i] + 7) >> 3);
}
}
}
#endif
VP8AdjustFilterStrength(it); // ...and store filter stats.
} else {
// Something bad happened -> need to do some memory cleanup.
VP8EncFreeBitWriters(enc);
}
return ok;
}
//------------------------------------------------------------------------------
// VP8EncLoop(): does the final bitstream coding.
static void ResetAfterSkip(VP8EncIterator* const it) {
if (it->mb_->type_ == 1) {
*it->nz_ = 0; // reset all predictors
it->left_nz_[8] = 0;
} else {
*it->nz_ &= (1 << 24); // preserve the dc_nz bit
}
}
int VP8EncLoop(VP8Encoder* const enc) {
VP8EncIterator it;
int ok = PreLoopInitialize(enc);
if (!ok) return 0;
StatLoop(enc); // stats-collection loop
VP8IteratorInit(enc, &it);
VP8InitFilter(&it);
do {
VP8ModeScore info;
const int dont_use_skip = !enc->proba_.use_skip_proba_;
const VP8RDLevel rd_opt = enc->rd_opt_level_;
VP8IteratorImport(&it, NULL);
// Warning! order is important: first call VP8Decimate() and
// *then* decide how to code the skip decision if there's one.
if (!VP8Decimate(&it, &info, rd_opt) || dont_use_skip) {
CodeResiduals(it.bw_, &it, &info);
} else { // reset predictors after a skip
ResetAfterSkip(&it);
}
StoreSideInfo(&it);
VP8StoreFilterStats(&it);
VP8IteratorExport(&it);
ok = VP8IteratorProgress(&it, 20);
VP8IteratorSaveBoundary(&it);
} while (ok && VP8IteratorNext(&it));
return PostLoopFinalize(&it, ok);
}
//------------------------------------------------------------------------------
// Single pass using Token Buffer.
#if !defined(DISABLE_TOKEN_BUFFER)
#define MIN_COUNT 96 // minimum number of macroblocks before updating stats
int VP8EncTokenLoop(VP8Encoder* const enc) {
// Roughly refresh the proba eight times per pass
int max_count = (enc->mb_w_ * enc->mb_h_) >> 3;
int num_pass_left = enc->config_->pass;
const int do_search = enc->do_search_;
VP8EncIterator it;
VP8EncProba* const proba = &enc->proba_;
const VP8RDLevel rd_opt = enc->rd_opt_level_;
const uint64_t pixel_count = enc->mb_w_ * enc->mb_h_ * 384;
PassStats stats;
int ok;
InitPassStats(enc, &stats);
ok = PreLoopInitialize(enc);
if (!ok) return 0;
if (max_count < MIN_COUNT) max_count = MIN_COUNT;
assert(enc->num_parts_ == 1);
assert(enc->use_tokens_);
assert(proba->use_skip_proba_ == 0);
assert(rd_opt >= RD_OPT_BASIC); // otherwise, token-buffer won't be useful
assert(num_pass_left > 0);
while (ok && num_pass_left-- > 0) {
const int is_last_pass = (fabs(stats.dq) <= DQ_LIMIT) ||
(num_pass_left == 0) ||
(enc->max_i4_header_bits_ == 0);
uint64_t size_p0 = 0;
uint64_t distortion = 0;
int cnt = max_count;
VP8IteratorInit(enc, &it);
SetLoopParams(enc, stats.q);
if (is_last_pass) {
ResetTokenStats(enc);
VP8InitFilter(&it); // don't collect stats until last pass (too costly)
}
VP8TBufferClear(&enc->tokens_);
do {
VP8ModeScore info;
VP8IteratorImport(&it, NULL);
if (--cnt < 0) {
FinalizeTokenProbas(proba);
VP8CalculateLevelCosts(proba); // refresh cost tables for rd-opt
cnt = max_count;
}
VP8Decimate(&it, &info, rd_opt);
ok = RecordTokens(&it, &info, &enc->tokens_);
if (!ok) {
WebPEncodingSetError(enc->pic_, VP8_ENC_ERROR_OUT_OF_MEMORY);
break;
}
size_p0 += info.H;
distortion += info.D;
if (is_last_pass) {
StoreSideInfo(&it);
VP8StoreFilterStats(&it);
VP8IteratorExport(&it);
ok = VP8IteratorProgress(&it, 20);
}
VP8IteratorSaveBoundary(&it);
} while (ok && VP8IteratorNext(&it));
if (!ok) break;
size_p0 += enc->segment_hdr_.size_;
if (stats.do_size_search) {
uint64_t size = FinalizeTokenProbas(&enc->proba_);
size += VP8EstimateTokenSize(&enc->tokens_,
(const uint8_t*)proba->coeffs_);
size = (size + size_p0 + 1024) >> 11; // -> size in bytes
size += HEADER_SIZE_ESTIMATE;
stats.value = (double)size;
} else { // compute and store PSNR
stats.value = GetPSNR(distortion, pixel_count);
}
#if (DEBUG_SEARCH > 0)
printf("#%2d metric:%.1lf -> %.1lf last_q=%.2lf q=%.2lf dq=%.2lf "
" range:[%.1f, %.1f]\n",
num_pass_left, stats.last_value, stats.value,
stats.last_q, stats.q, stats.dq, stats.qmin, stats.qmax);
#endif
if (enc->max_i4_header_bits_ > 0 && size_p0 > PARTITION0_SIZE_LIMIT) {
++num_pass_left;
enc->max_i4_header_bits_ >>= 1; // strengthen header bit limitation...
if (is_last_pass) {
ResetSideInfo(&it);
}
continue; // ...and start over
}
if (is_last_pass) {
break; // done
}
if (do_search) {
ComputeNextQ(&stats); // Adjust q
}
}
if (ok) {
if (!stats.do_size_search) {
FinalizeTokenProbas(&enc->proba_);
}
ok = VP8EmitTokens(&enc->tokens_, enc->parts_ + 0,
(const uint8_t*)proba->coeffs_, 1);
}
ok = ok && WebPReportProgress(enc->pic_, enc->percent_ + 20, &enc->percent_);
return PostLoopFinalize(&it, ok);
}
#else
int VP8EncTokenLoop(VP8Encoder* const enc) {
(void)enc;
return 0; // we shouldn't be here.
}
#endif // DISABLE_TOKEN_BUFFER
//------------------------------------------------------------------------------

File diff suppressed because it is too large


@ -0,0 +1,459 @@
// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// VP8Iterator: block iterator
//
// Author: Skal (pascal.massimino@gmail.com)
#include <string.h>
#include "src/enc/vp8i_enc.h"
//------------------------------------------------------------------------------
// VP8Iterator
//------------------------------------------------------------------------------
static void InitLeft(VP8EncIterator* const it) {
it->y_left_[-1] = it->u_left_[-1] = it->v_left_[-1] =
(it->y_ > 0) ? 129 : 127;
memset(it->y_left_, 129, 16);
memset(it->u_left_, 129, 8);
memset(it->v_left_, 129, 8);
it->left_nz_[8] = 0;
if (it->top_derr_ != NULL) {
memset(&it->left_derr_, 0, sizeof(it->left_derr_));
}
}
static void InitTop(VP8EncIterator* const it) {
const VP8Encoder* const enc = it->enc_;
const size_t top_size = enc->mb_w_ * 16;
memset(enc->y_top_, 127, 2 * top_size);
memset(enc->nz_, 0, enc->mb_w_ * sizeof(*enc->nz_));
if (enc->top_derr_ != NULL) {
memset(enc->top_derr_, 0, enc->mb_w_ * sizeof(*enc->top_derr_));
}
}
void VP8IteratorSetRow(VP8EncIterator* const it, int y) {
VP8Encoder* const enc = it->enc_;
it->x_ = 0;
it->y_ = y;
it->bw_ = &enc->parts_[y & (enc->num_parts_ - 1)];
it->preds_ = enc->preds_ + y * 4 * enc->preds_w_;
it->nz_ = enc->nz_;
it->mb_ = enc->mb_info_ + y * enc->mb_w_;
it->y_top_ = enc->y_top_;
it->uv_top_ = enc->uv_top_;
InitLeft(it);
}
void VP8IteratorReset(VP8EncIterator* const it) {
VP8Encoder* const enc = it->enc_;
VP8IteratorSetRow(it, 0);
VP8IteratorSetCountDown(it, enc->mb_w_ * enc->mb_h_); // default
InitTop(it);
memset(it->bit_count_, 0, sizeof(it->bit_count_));
it->do_trellis_ = 0;
}
void VP8IteratorSetCountDown(VP8EncIterator* const it, int count_down) {
it->count_down_ = it->count_down0_ = count_down;
}
int VP8IteratorIsDone(const VP8EncIterator* const it) {
return (it->count_down_ <= 0);
}
void VP8IteratorInit(VP8Encoder* const enc, VP8EncIterator* const it) {
it->enc_ = enc;
it->yuv_in_ = (uint8_t*)WEBP_ALIGN(it->yuv_mem_);
it->yuv_out_ = it->yuv_in_ + YUV_SIZE_ENC;
it->yuv_out2_ = it->yuv_out_ + YUV_SIZE_ENC;
it->yuv_p_ = it->yuv_out2_ + YUV_SIZE_ENC;
it->lf_stats_ = enc->lf_stats_;
it->percent0_ = enc->percent_;
it->y_left_ = (uint8_t*)WEBP_ALIGN(it->yuv_left_mem_ + 1);
it->u_left_ = it->y_left_ + 16 + 16;
it->v_left_ = it->u_left_ + 16;
it->top_derr_ = enc->top_derr_;
VP8IteratorReset(it);
}
int VP8IteratorProgress(const VP8EncIterator* const it, int delta) {
VP8Encoder* const enc = it->enc_;
if (delta && enc->pic_->progress_hook != NULL) {
const int done = it->count_down0_ - it->count_down_;
const int percent = (it->count_down0_ <= 0)
? it->percent0_
: it->percent0_ + delta * done / it->count_down0_;
return WebPReportProgress(enc->pic_, percent, &enc->percent_);
}
return 1;
}
//------------------------------------------------------------------------------
// Import the source samples into the cache. Takes care of replicating
// boundary pixels if necessary.
static WEBP_INLINE int MinSize(int a, int b) { return (a < b) ? a : b; }
static void ImportBlock(const uint8_t* src, int src_stride,
uint8_t* dst, int w, int h, int size) {
int i;
for (i = 0; i < h; ++i) {
memcpy(dst, src, w);
if (w < size) {
memset(dst + w, dst[w - 1], size - w);
}
dst += BPS;
src += src_stride;
}
for (i = h; i < size; ++i) {
memcpy(dst, dst - BPS, size);
dst += BPS;
}
}
static void ImportLine(const uint8_t* src, int src_stride,
uint8_t* dst, int len, int total_len) {
int i;
for (i = 0; i < len; ++i, src += src_stride) dst[i] = *src;
for (; i < total_len; ++i) dst[i] = dst[len - 1];
}
void VP8IteratorImport(VP8EncIterator* const it, uint8_t* const tmp_32) {
const VP8Encoder* const enc = it->enc_;
const int x = it->x_, y = it->y_;
const WebPPicture* const pic = enc->pic_;
const uint8_t* const ysrc = pic->y + (y * pic->y_stride + x) * 16;
const uint8_t* const usrc = pic->u + (y * pic->uv_stride + x) * 8;
const uint8_t* const vsrc = pic->v + (y * pic->uv_stride + x) * 8;
const int w = MinSize(pic->width - x * 16, 16);
const int h = MinSize(pic->height - y * 16, 16);
const int uv_w = (w + 1) >> 1;
const int uv_h = (h + 1) >> 1;
ImportBlock(ysrc, pic->y_stride, it->yuv_in_ + Y_OFF_ENC, w, h, 16);
ImportBlock(usrc, pic->uv_stride, it->yuv_in_ + U_OFF_ENC, uv_w, uv_h, 8);
ImportBlock(vsrc, pic->uv_stride, it->yuv_in_ + V_OFF_ENC, uv_w, uv_h, 8);
if (tmp_32 == NULL) return;
// Import source (uncompressed) samples into boundary.
if (x == 0) {
InitLeft(it);
} else {
if (y == 0) {
it->y_left_[-1] = it->u_left_[-1] = it->v_left_[-1] = 127;
} else {
it->y_left_[-1] = ysrc[- 1 - pic->y_stride];
it->u_left_[-1] = usrc[- 1 - pic->uv_stride];
it->v_left_[-1] = vsrc[- 1 - pic->uv_stride];
}
ImportLine(ysrc - 1, pic->y_stride, it->y_left_, h, 16);
ImportLine(usrc - 1, pic->uv_stride, it->u_left_, uv_h, 8);
ImportLine(vsrc - 1, pic->uv_stride, it->v_left_, uv_h, 8);
}
it->y_top_ = tmp_32 + 0;
it->uv_top_ = tmp_32 + 16;
if (y == 0) {
memset(tmp_32, 127, 32 * sizeof(*tmp_32));
} else {
ImportLine(ysrc - pic->y_stride, 1, tmp_32, w, 16);
ImportLine(usrc - pic->uv_stride, 1, tmp_32 + 16, uv_w, 8);
ImportLine(vsrc - pic->uv_stride, 1, tmp_32 + 16 + 8, uv_w, 8);
}
}
//------------------------------------------------------------------------------
// Copy back the compressed samples into user space if requested.
static void ExportBlock(const uint8_t* src, uint8_t* dst, int dst_stride,
int w, int h) {
while (h-- > 0) {
memcpy(dst, src, w);
dst += dst_stride;
src += BPS;
}
}
void VP8IteratorExport(const VP8EncIterator* const it) {
const VP8Encoder* const enc = it->enc_;
if (enc->config_->show_compressed) {
const int x = it->x_, y = it->y_;
const uint8_t* const ysrc = it->yuv_out_ + Y_OFF_ENC;
const uint8_t* const usrc = it->yuv_out_ + U_OFF_ENC;
const uint8_t* const vsrc = it->yuv_out_ + V_OFF_ENC;
const WebPPicture* const pic = enc->pic_;
uint8_t* const ydst = pic->y + (y * pic->y_stride + x) * 16;
uint8_t* const udst = pic->u + (y * pic->uv_stride + x) * 8;
uint8_t* const vdst = pic->v + (y * pic->uv_stride + x) * 8;
int w = (pic->width - x * 16);
int h = (pic->height - y * 16);
if (w > 16) w = 16;
if (h > 16) h = 16;
// Luma plane
ExportBlock(ysrc, ydst, pic->y_stride, w, h);
{ // U/V planes
const int uv_w = (w + 1) >> 1;
const int uv_h = (h + 1) >> 1;
ExportBlock(usrc, udst, pic->uv_stride, uv_w, uv_h);
ExportBlock(vsrc, vdst, pic->uv_stride, uv_w, uv_h);
}
}
}
//------------------------------------------------------------------------------
// Non-zero contexts setup/teardown
// Nz bits:
// 0 1 2 3 Y
// 4 5 6 7
// 8 9 10 11
// 12 13 14 15
// 16 17 U
// 18 19
// 20 21 V
// 22 23
// 24 DC-intra16
// Convert packed context to byte array
#define BIT(nz, n) (!!((nz) & (1 << (n))))
void VP8IteratorNzToBytes(VP8EncIterator* const it) {
const int tnz = it->nz_[0], lnz = it->nz_[-1];
int* const top_nz = it->top_nz_;
int* const left_nz = it->left_nz_;
// Top-Y
top_nz[0] = BIT(tnz, 12);
top_nz[1] = BIT(tnz, 13);
top_nz[2] = BIT(tnz, 14);
top_nz[3] = BIT(tnz, 15);
// Top-U
top_nz[4] = BIT(tnz, 18);
top_nz[5] = BIT(tnz, 19);
// Top-V
top_nz[6] = BIT(tnz, 22);
top_nz[7] = BIT(tnz, 23);
// DC
top_nz[8] = BIT(tnz, 24);
// left-Y
left_nz[0] = BIT(lnz, 3);
left_nz[1] = BIT(lnz, 7);
left_nz[2] = BIT(lnz, 11);
left_nz[3] = BIT(lnz, 15);
// left-U
left_nz[4] = BIT(lnz, 17);
left_nz[5] = BIT(lnz, 19);
// left-V
left_nz[6] = BIT(lnz, 21);
left_nz[7] = BIT(lnz, 23);
// left-DC is special, iterated separately
}
void VP8IteratorBytesToNz(VP8EncIterator* const it) {
uint32_t nz = 0;
const int* const top_nz = it->top_nz_;
const int* const left_nz = it->left_nz_;
// top
nz |= (top_nz[0] << 12) | (top_nz[1] << 13);
nz |= (top_nz[2] << 14) | (top_nz[3] << 15);
nz |= (top_nz[4] << 18) | (top_nz[5] << 19);
nz |= (top_nz[6] << 22) | (top_nz[7] << 23);
nz |= (top_nz[8] << 24); // we propagate the _top_ bit, esp. for intra4
// left
nz |= (left_nz[0] << 3) | (left_nz[1] << 7);
nz |= (left_nz[2] << 11);
nz |= (left_nz[4] << 17) | (left_nz[6] << 21);
*it->nz_ = nz;
}
#undef BIT
//------------------------------------------------------------------------------
// Advance to the next position, doing the bookkeeping.
void VP8IteratorSaveBoundary(VP8EncIterator* const it) {
VP8Encoder* const enc = it->enc_;
const int x = it->x_, y = it->y_;
const uint8_t* const ysrc = it->yuv_out_ + Y_OFF_ENC;
const uint8_t* const uvsrc = it->yuv_out_ + U_OFF_ENC;
if (x < enc->mb_w_ - 1) { // left
int i;
for (i = 0; i < 16; ++i) {
it->y_left_[i] = ysrc[15 + i * BPS];
}
for (i = 0; i < 8; ++i) {
it->u_left_[i] = uvsrc[7 + i * BPS];
it->v_left_[i] = uvsrc[15 + i * BPS];
}
// top-left (before 'top'!)
it->y_left_[-1] = it->y_top_[15];
it->u_left_[-1] = it->uv_top_[0 + 7];
it->v_left_[-1] = it->uv_top_[8 + 7];
}
if (y < enc->mb_h_ - 1) { // top
memcpy(it->y_top_, ysrc + 15 * BPS, 16);
memcpy(it->uv_top_, uvsrc + 7 * BPS, 8 + 8);
}
}
int VP8IteratorNext(VP8EncIterator* const it) {
if (++it->x_ == it->enc_->mb_w_) {
VP8IteratorSetRow(it, ++it->y_);
} else {
it->preds_ += 4;
it->mb_ += 1;
it->nz_ += 1;
it->y_top_ += 16;
it->uv_top_ += 16;
}
return (0 < --it->count_down_);
}
//------------------------------------------------------------------------------
// Helper function to set mode properties
void VP8SetIntra16Mode(const VP8EncIterator* const it, int mode) {
uint8_t* preds = it->preds_;
int y;
for (y = 0; y < 4; ++y) {
memset(preds, mode, 4);
preds += it->enc_->preds_w_;
}
it->mb_->type_ = 1;
}
void VP8SetIntra4Mode(const VP8EncIterator* const it, const uint8_t* modes) {
uint8_t* preds = it->preds_;
int y;
for (y = 4; y > 0; --y) {
memcpy(preds, modes, 4 * sizeof(*modes));
preds += it->enc_->preds_w_;
modes += 4;
}
it->mb_->type_ = 0;
}
void VP8SetIntraUVMode(const VP8EncIterator* const it, int mode) {
it->mb_->uv_mode_ = mode;
}
void VP8SetSkip(const VP8EncIterator* const it, int skip) {
it->mb_->skip_ = skip;
}
void VP8SetSegment(const VP8EncIterator* const it, int segment) {
it->mb_->segment_ = segment;
}
//------------------------------------------------------------------------------
// Intra4x4 sub-blocks iteration
//
// We store and update the boundary samples in an array of 37 pixels. They
// are updated as we iterate over and reconstruct each intra4x4 block in turn.
// The position of the samples has the following snake pattern:
//
// 16|17 18 19 20|21 22 23 24|25 26 27 28|29 30 31 32|33 34 35 36 <- Top-right
// --+-----------+-----------+-----------+-----------+
// 15| 19| 23| 27| 31|
// 14| 18| 22| 26| 30|
// 13| 17| 21| 25| 29|
// 12|13 14 15 16|17 18 19 20|21 22 23 24|25 26 27 28|
// --+-----------+-----------+-----------+-----------+
// 11| 15| 19| 23| 27|
// 10| 14| 18| 22| 26|
// 9| 13| 17| 21| 25|
// 8| 9 10 11 12|13 14 15 16|17 18 19 20|21 22 23 24|
// --+-----------+-----------+-----------+-----------+
// 7| 11| 15| 19| 23|
// 6| 10| 14| 18| 22|
// 5| 9| 13| 17| 21|
// 4| 5 6 7 8| 9 10 11 12|13 14 15 16|17 18 19 20|
// --+-----------+-----------+-----------+-----------+
// 3| 7| 11| 15| 19|
// 2| 6| 10| 14| 18|
// 1| 5| 9| 13| 17|
// 0| 1 2 3 4| 5 6 7 8| 9 10 11 12|13 14 15 16|
// --+-----------+-----------+-----------+-----------+
// Array to record the position of the top sample to pass to the prediction
// functions in dsp.c.
static const uint8_t VP8TopLeftI4[16] = {
17, 21, 25, 29,
13, 17, 21, 25,
9, 13, 17, 21,
5, 9, 13, 17
};
void VP8IteratorStartI4(VP8EncIterator* const it) {
const VP8Encoder* const enc = it->enc_;
int i;
it->i4_ = 0; // first 4x4 sub-block
it->i4_top_ = it->i4_boundary_ + VP8TopLeftI4[0];
// Import the boundary samples
for (i = 0; i < 17; ++i) { // left
it->i4_boundary_[i] = it->y_left_[15 - i];
}
for (i = 0; i < 16; ++i) { // top
it->i4_boundary_[17 + i] = it->y_top_[i];
}
// top-right samples have a special case on the far right of the picture
if (it->x_ < enc->mb_w_ - 1) {
for (i = 16; i < 16 + 4; ++i) {
it->i4_boundary_[17 + i] = it->y_top_[i];
}
} else { // else, replicate the last valid pixel four times
for (i = 16; i < 16 + 4; ++i) {
it->i4_boundary_[17 + i] = it->i4_boundary_[17 + 15];
}
}
VP8IteratorNzToBytes(it); // import the non-zero context
}
int VP8IteratorRotateI4(VP8EncIterator* const it,
const uint8_t* const yuv_out) {
const uint8_t* const blk = yuv_out + VP8Scan[it->i4_];
uint8_t* const top = it->i4_top_;
int i;
// Update the cache with 7 fresh samples
for (i = 0; i <= 3; ++i) {
top[-4 + i] = blk[i + 3 * BPS]; // store future top samples
}
if ((it->i4_ & 3) != 3) { // if not on the right sub-blocks #3, #7, #11, #15
for (i = 0; i <= 2; ++i) { // store future left samples
top[i] = blk[3 + (2 - i) * BPS];
}
} else {  // else replicate top-right samples, as the spec says.
for (i = 0; i <= 3; ++i) {
top[i] = top[i + 4];
}
}
// move pointers to next sub-block
++it->i4_;
if (it->i4_ == 16) { // we're done
return 0;
}
it->i4_top_ = it->i4_boundary_ + VP8TopLeftI4[it->i4_];
return 1;
}
//------------------------------------------------------------------------------


@ -0,0 +1,46 @@
# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
# vim: set filetype=python:
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
with Files('**'):
BUG_COMPONENT = ('Core', 'ImageLib')
SOURCES += [
'alpha_enc.c',
'analysis_enc.c',
'backward_references_cost_enc.c',
'backward_references_enc.c',
'config_enc.c',
'cost_enc.c',
'filter_enc.c',
'frame_enc.c',
'histogram_enc.c',
'iterator_enc.c',
'near_lossless_enc.c',
'picture_csp_enc.c',
'picture_enc.c',
'picture_psnr_enc.c',
'picture_rescale_enc.c',
'picture_tools_enc.c',
'predictor_enc.c',
'quant_enc.c',
'syntax_enc.c',
'token_enc.c',
'tree_enc.c',
'vp8l_enc.c',
'webp_enc.c',
]
LOCAL_INCLUDES += [
'/media/libwebp',
]
# Add libFuzzer configuration directives
include('/tools/fuzzing/libfuzzer-config.mozbuild')
FINAL_LIBRARY = 'gkmedias'
# We allow warnings for third-party code that can be updated from upstream.
AllowCompilerWarnings()


@ -0,0 +1,151 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Near-lossless image preprocessing adjusts pixel values to help
// compressibility with a guarantee of maximum deviation between original and
// resulting pixel values.
//
// Author: Jyrki Alakuijala (jyrki@google.com)
// Converted to C by Aleksander Kramarz (akramarz@google.com)
#include <assert.h>
#include <stdlib.h>
#include "src/dsp/lossless_common.h"
#include "src/utils/utils.h"
#include "src/enc/vp8li_enc.h"
#if (WEBP_NEAR_LOSSLESS == 1)
#define MIN_DIM_FOR_NEAR_LOSSLESS 64
#define MAX_LIMIT_BITS 5
// Quantizes the value up or down to a multiple of 1<<bits (or to 255),
// choosing the closer one, resolving ties using bankers' rounding.
static uint32_t FindClosestDiscretized(uint32_t a, int bits) {
const uint32_t mask = (1u << bits) - 1;
const uint32_t biased = a + (mask >> 1) + ((a >> bits) & 1);
assert(bits > 0);
if (biased > 0xff) return 0xff;
return biased & ~mask;
}
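// e.g. with bits = 2 (multiples of 4): a = 6, equidistant from 4 and 8,
// rounds to 8, and a = 10, equidistant from 8 and 12, also rounds to 8;
// the '(a >> bits) & 1' term is what breaks ties towards the even multiple.
// Any biased value above 0xff clamps to 255.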
// Applies FindClosestDiscretized to all channels of pixel.
static uint32_t ClosestDiscretizedArgb(uint32_t a, int bits) {
return
(FindClosestDiscretized(a >> 24, bits) << 24) |
(FindClosestDiscretized((a >> 16) & 0xff, bits) << 16) |
(FindClosestDiscretized((a >> 8) & 0xff, bits) << 8) |
(FindClosestDiscretized(a & 0xff, bits));
}
// Checks if distance between corresponding channel values of pixels a and b
// is within the given limit.
static int IsNear(uint32_t a, uint32_t b, int limit) {
int k;
for (k = 0; k < 4; ++k) {
const int delta =
(int)((a >> (k * 8)) & 0xff) - (int)((b >> (k * 8)) & 0xff);
if (delta >= limit || delta <= -limit) {
return 0;
}
}
return 1;
}
static int IsSmooth(const uint32_t* const prev_row,
const uint32_t* const curr_row,
const uint32_t* const next_row,
int ix, int limit) {
// Check that all pixels in 4-connected neighborhood are smooth.
return (IsNear(curr_row[ix], curr_row[ix - 1], limit) &&
IsNear(curr_row[ix], curr_row[ix + 1], limit) &&
IsNear(curr_row[ix], prev_row[ix], limit) &&
IsNear(curr_row[ix], next_row[ix], limit));
}
// Adjusts pixel values of image with given maximum error.
static void NearLossless(int xsize, int ysize, const uint32_t* argb_src,
int stride, int limit_bits, uint32_t* copy_buffer,
uint32_t* argb_dst) {
int x, y;
const int limit = 1 << limit_bits;
uint32_t* prev_row = copy_buffer;
uint32_t* curr_row = prev_row + xsize;
uint32_t* next_row = curr_row + xsize;
memcpy(curr_row, argb_src, xsize * sizeof(argb_src[0]));
memcpy(next_row, argb_src + stride, xsize * sizeof(argb_src[0]));
for (y = 0; y < ysize; ++y, argb_src += stride, argb_dst += xsize) {
if (y == 0 || y == ysize - 1) {
memcpy(argb_dst, argb_src, xsize * sizeof(argb_src[0]));
} else {
memcpy(next_row, argb_src + stride, xsize * sizeof(argb_src[0]));
argb_dst[0] = argb_src[0];
argb_dst[xsize - 1] = argb_src[xsize - 1];
for (x = 1; x < xsize - 1; ++x) {
if (IsSmooth(prev_row, curr_row, next_row, x, limit)) {
argb_dst[x] = curr_row[x];
} else {
argb_dst[x] = ClosestDiscretizedArgb(curr_row[x], limit_bits);
}
}
}
{
// Three-way swap.
uint32_t* const temp = prev_row;
prev_row = curr_row;
curr_row = next_row;
next_row = temp;
}
}
}
int VP8ApplyNearLossless(const WebPPicture* const picture, int quality,
uint32_t* const argb_dst) {
int i;
const int xsize = picture->width;
const int ysize = picture->height;
const int stride = picture->argb_stride;
uint32_t* const copy_buffer =
(uint32_t*)WebPSafeMalloc(xsize * 3, sizeof(*copy_buffer));
const int limit_bits = VP8LNearLosslessBits(quality);
assert(argb_dst != NULL);
assert(limit_bits > 0);
assert(limit_bits <= MAX_LIMIT_BITS);
if (copy_buffer == NULL) {
return 0;
}
// For small icon images, don't attempt to apply near-lossless compression.
if ((xsize < MIN_DIM_FOR_NEAR_LOSSLESS &&
ysize < MIN_DIM_FOR_NEAR_LOSSLESS) ||
ysize < 3) {
for (i = 0; i < ysize; ++i) {
memcpy(argb_dst + i * xsize, picture->argb + i * picture->argb_stride,
xsize * sizeof(*argb_dst));
}
WebPSafeFree(copy_buffer);
return 1;
}
NearLossless(xsize, ysize, picture->argb, stride, limit_bits, copy_buffer,
argb_dst);
for (i = limit_bits - 1; i != 0; --i) {
NearLossless(xsize, ysize, argb_dst, xsize, i, copy_buffer, argb_dst);
}
WebPSafeFree(copy_buffer);
return 1;
}
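// Note: the cascade above applies the full 'limit_bits' quantization first,
// then re-runs the smoothness-aware pass with every smaller limit down to 1,
// refining earlier coarse adjustments where they would be visible.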
#else // (WEBP_NEAR_LOSSLESS == 1)
// Define a stub to suppress compiler warnings.
extern void VP8LNearLosslessStub(void);
void VP8LNearLosslessStub(void) {}
#endif // (WEBP_NEAR_LOSSLESS == 1)

File diff suppressed because it is too large


@ -0,0 +1,296 @@
// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// WebPPicture class basis
//
// Author: Skal (pascal.massimino@gmail.com)
#include <assert.h>
#include <stdlib.h>
#include "src/enc/vp8i_enc.h"
#include "src/dsp/dsp.h"
#include "src/utils/utils.h"
//------------------------------------------------------------------------------
// WebPPicture
//------------------------------------------------------------------------------
static int DummyWriter(const uint8_t* data, size_t data_size,
const WebPPicture* const picture) {
// The following casts are to prevent 'unused variable' error messages.
(void)data;
(void)data_size;
(void)picture;
return 1;
}
int WebPPictureInitInternal(WebPPicture* picture, int version) {
if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_ENCODER_ABI_VERSION)) {
return 0; // caller/system version mismatch!
}
if (picture != NULL) {
memset(picture, 0, sizeof(*picture));
picture->writer = DummyWriter;
WebPEncodingSetError(picture, VP8_ENC_OK);
}
return 1;
}
//------------------------------------------------------------------------------
static void WebPPictureResetBufferARGB(WebPPicture* const picture) {
picture->memory_argb_ = NULL;
picture->argb = NULL;
picture->argb_stride = 0;
}
static void WebPPictureResetBufferYUVA(WebPPicture* const picture) {
picture->memory_ = NULL;
picture->y = picture->u = picture->v = picture->a = NULL;
picture->y_stride = picture->uv_stride = 0;
picture->a_stride = 0;
}
void WebPPictureResetBuffers(WebPPicture* const picture) {
WebPPictureResetBufferARGB(picture);
WebPPictureResetBufferYUVA(picture);
}
int WebPPictureAllocARGB(WebPPicture* const picture, int width, int height) {
void* memory;
const uint64_t argb_size = (uint64_t)width * height;
assert(picture != NULL);
WebPSafeFree(picture->memory_argb_);
WebPPictureResetBufferARGB(picture);
if (width <= 0 || height <= 0) {
return WebPEncodingSetError(picture, VP8_ENC_ERROR_BAD_DIMENSION);
}
// allocate a new buffer.
memory = WebPSafeMalloc(argb_size + WEBP_ALIGN_CST, sizeof(*picture->argb));
if (memory == NULL) {
return WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
}
picture->memory_argb_ = memory;
picture->argb = (uint32_t*)WEBP_ALIGN(memory);
picture->argb_stride = width;
return 1;
}
int WebPPictureAllocYUVA(WebPPicture* const picture, int width, int height) {
const WebPEncCSP uv_csp =
(WebPEncCSP)((int)picture->colorspace & WEBP_CSP_UV_MASK);
const int has_alpha = (int)picture->colorspace & WEBP_CSP_ALPHA_BIT;
const int y_stride = width;
const int uv_width = (int)(((int64_t)width + 1) >> 1);
const int uv_height = (int)(((int64_t)height + 1) >> 1);
const int uv_stride = uv_width;
int a_width, a_stride;
uint64_t y_size, uv_size, a_size, total_size;
uint8_t* mem;
assert(picture != NULL);
WebPSafeFree(picture->memory_);
WebPPictureResetBufferYUVA(picture);
if (uv_csp != WEBP_YUV420) {
return WebPEncodingSetError(picture, VP8_ENC_ERROR_INVALID_CONFIGURATION);
}
// alpha
a_width = has_alpha ? width : 0;
a_stride = a_width;
y_size = (uint64_t)y_stride * height;
uv_size = (uint64_t)uv_stride * uv_height;
a_size = (uint64_t)a_stride * height;
total_size = y_size + a_size + 2 * uv_size;
// Security and validation checks
if (width <= 0 || height <= 0 || // luma/alpha param error
uv_width <= 0 || uv_height <= 0) { // u/v param error
return WebPEncodingSetError(picture, VP8_ENC_ERROR_BAD_DIMENSION);
}
// allocate a new buffer.
mem = (uint8_t*)WebPSafeMalloc(total_size, sizeof(*mem));
if (mem == NULL) {
return WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
}
// From now on, we're in the clear, we can no longer fail...
picture->memory_ = (void*)mem;
picture->y_stride = y_stride;
picture->uv_stride = uv_stride;
picture->a_stride = a_stride;
// TODO(skal): we could align the y/u/v planes and adjust stride.
picture->y = mem;
mem += y_size;
picture->u = mem;
mem += uv_size;
picture->v = mem;
mem += uv_size;
if (a_size > 0) {
picture->a = mem;
mem += a_size;
}
(void)mem; // makes the static analyzer happy
return 1;
}
int WebPPictureAlloc(WebPPicture* picture) {
if (picture != NULL) {
const int width = picture->width;
const int height = picture->height;
WebPPictureFree(picture); // erase previous buffer
if (!picture->use_argb) {
return WebPPictureAllocYUVA(picture, width, height);
} else {
return WebPPictureAllocARGB(picture, width, height);
}
}
return 1;
}
void WebPPictureFree(WebPPicture* picture) {
if (picture != NULL) {
WebPSafeFree(picture->memory_);
WebPSafeFree(picture->memory_argb_);
WebPPictureResetBuffers(picture);
}
}
//------------------------------------------------------------------------------
// WebPMemoryWriter: Write-to-memory
void WebPMemoryWriterInit(WebPMemoryWriter* writer) {
writer->mem = NULL;
writer->size = 0;
writer->max_size = 0;
}
int WebPMemoryWrite(const uint8_t* data, size_t data_size,
const WebPPicture* picture) {
WebPMemoryWriter* const w = (WebPMemoryWriter*)picture->custom_ptr;
uint64_t next_size;
if (w == NULL) {
return 1;
}
next_size = (uint64_t)w->size + data_size;
if (next_size > w->max_size) {
uint8_t* new_mem;
uint64_t next_max_size = 2ULL * w->max_size;
if (next_max_size < next_size) next_max_size = next_size;
if (next_max_size < 8192ULL) next_max_size = 8192ULL;
new_mem = (uint8_t*)WebPSafeMalloc(next_max_size, 1);
if (new_mem == NULL) {
return 0;
}
if (w->size > 0) {
memcpy(new_mem, w->mem, w->size);
}
WebPSafeFree(w->mem);
w->mem = new_mem;
// down-cast is ok, thanks to WebPSafeMalloc
w->max_size = (size_t)next_max_size;
}
if (data_size > 0) {
memcpy(w->mem + w->size, data, data_size);
w->size += data_size;
}
return 1;
}
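// Growth policy: the buffer at least doubles on every reallocation and never
// shrinks below 8192 bytes, so n small writes trigger only O(log n)
// reallocations; WebPSafeMalloc's overflow check is what makes the later
// down-cast of 'next_max_size' safe.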
void WebPMemoryWriterClear(WebPMemoryWriter* writer) {
if (writer != NULL) {
WebPSafeFree(writer->mem);
writer->mem = NULL;
writer->size = 0;
writer->max_size = 0;
}
}
//------------------------------------------------------------------------------
// Simplest high-level calls:
typedef int (*Importer)(WebPPicture* const, const uint8_t* const, int);
static size_t Encode(const uint8_t* rgba, int width, int height, int stride,
Importer import, float quality_factor, int lossless,
uint8_t** output) {
WebPPicture pic;
WebPConfig config;
WebPMemoryWriter wrt;
int ok;
if (output == NULL) return 0;
if (!WebPConfigPreset(&config, WEBP_PRESET_DEFAULT, quality_factor) ||
!WebPPictureInit(&pic)) {
return 0; // shouldn't happen, except if system installation is broken
}
config.lossless = !!lossless;
pic.use_argb = !!lossless;
pic.width = width;
pic.height = height;
pic.writer = WebPMemoryWrite;
pic.custom_ptr = &wrt;
WebPMemoryWriterInit(&wrt);
ok = import(&pic, rgba, stride) && WebPEncode(&config, &pic);
WebPPictureFree(&pic);
if (!ok) {
WebPMemoryWriterClear(&wrt);
*output = NULL;
return 0;
}
*output = wrt.mem;
return wrt.size;
}
#define ENCODE_FUNC(NAME, IMPORTER) \
size_t NAME(const uint8_t* in, int w, int h, int bps, float q, \
uint8_t** out) { \
return Encode(in, w, h, bps, IMPORTER, q, 0, out); \
}
ENCODE_FUNC(WebPEncodeRGB, WebPPictureImportRGB)
ENCODE_FUNC(WebPEncodeRGBA, WebPPictureImportRGBA)
#if !defined(WEBP_REDUCE_CSP)
ENCODE_FUNC(WebPEncodeBGR, WebPPictureImportBGR)
ENCODE_FUNC(WebPEncodeBGRA, WebPPictureImportBGRA)
#endif // WEBP_REDUCE_CSP
#undef ENCODE_FUNC
#define LOSSLESS_DEFAULT_QUALITY 70.
#define LOSSLESS_ENCODE_FUNC(NAME, IMPORTER) \
size_t NAME(const uint8_t* in, int w, int h, int bps, uint8_t** out) { \
return Encode(in, w, h, bps, IMPORTER, LOSSLESS_DEFAULT_QUALITY, 1, out); \
}
LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessRGB, WebPPictureImportRGB)
LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessRGBA, WebPPictureImportRGBA)
#if !defined(WEBP_REDUCE_CSP)
LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessBGR, WebPPictureImportBGR)
LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessBGRA, WebPPictureImportBGRA)
#endif // WEBP_REDUCE_CSP
#undef LOSSLESS_ENCODE_FUNC
//------------------------------------------------------------------------------
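// Usage sketch (illustrative, not part of the library; assumes a tightly
// packed RGBA buffer with no row padding):
//
//   #include "src/webp/encode.h"
//
//   size_t EncodeToWebP(const uint8_t* rgba, int w, int h, uint8_t** out) {
//     // stride is in bytes: 4 bytes per RGBA pixel
//     return WebPEncodeRGBA(rgba, w, h, 4 * w, 75.f, out);
//   }
//
// On success the returned buffer must be released with WebPFree(); a return
// value of 0 signals failure (Encode() above also clears '*out' in that case).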


@ -0,0 +1,258 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// WebPPicture tools for measuring distortion
//
// Author: Skal (pascal.massimino@gmail.com)
#include "src/webp/encode.h"
#if !(defined(WEBP_DISABLE_STATS) || defined(WEBP_REDUCE_SIZE))
#include <math.h>
#include <stdlib.h>
#include "src/dsp/dsp.h"
#include "src/enc/vp8i_enc.h"
#include "src/utils/utils.h"
typedef double (*AccumulateFunc)(const uint8_t* src, int src_stride,
const uint8_t* ref, int ref_stride,
int w, int h);
//------------------------------------------------------------------------------
// local-min distortion
//
// For every pixel in the *reference* picture, we search for the local best
// match in the compressed image. This is not a symmetrical measure.
#define RADIUS 2 // search radius. Shouldn't be too large.
static double AccumulateLSIM(const uint8_t* src, int src_stride,
const uint8_t* ref, int ref_stride,
int w, int h) {
int x, y;
double total_sse = 0.;
for (y = 0; y < h; ++y) {
const int y_0 = (y - RADIUS < 0) ? 0 : y - RADIUS;
const int y_1 = (y + RADIUS + 1 >= h) ? h : y + RADIUS + 1;
for (x = 0; x < w; ++x) {
const int x_0 = (x - RADIUS < 0) ? 0 : x - RADIUS;
const int x_1 = (x + RADIUS + 1 >= w) ? w : x + RADIUS + 1;
double best_sse = 255. * 255.;
const double value = (double)ref[y * ref_stride + x];
int i, j;
for (j = y_0; j < y_1; ++j) {
const uint8_t* const s = src + j * src_stride;
for (i = x_0; i < x_1; ++i) {
const double diff = s[i] - value;
const double sse = diff * diff;
if (sse < best_sse) best_sse = sse;
}
}
total_sse += best_sse;
}
}
return total_sse;
}
#undef RADIUS
static double AccumulateSSE(const uint8_t* src, int src_stride,
const uint8_t* ref, int ref_stride,
int w, int h) {
int y;
double total_sse = 0.;
for (y = 0; y < h; ++y) {
total_sse += VP8AccumulateSSE(src, ref, w);
src += src_stride;
ref += ref_stride;
}
return total_sse;
}
//------------------------------------------------------------------------------
static double AccumulateSSIM(const uint8_t* src, int src_stride,
const uint8_t* ref, int ref_stride,
int w, int h) {
const int w0 = (w < VP8_SSIM_KERNEL) ? w : VP8_SSIM_KERNEL;
const int w1 = w - VP8_SSIM_KERNEL - 1;
const int h0 = (h < VP8_SSIM_KERNEL) ? h : VP8_SSIM_KERNEL;
const int h1 = h - VP8_SSIM_KERNEL - 1;
int x, y;
double sum = 0.;
for (y = 0; y < h0; ++y) {
for (x = 0; x < w; ++x) {
sum += VP8SSIMGetClipped(src, src_stride, ref, ref_stride, x, y, w, h);
}
}
for (; y < h1; ++y) {
for (x = 0; x < w0; ++x) {
sum += VP8SSIMGetClipped(src, src_stride, ref, ref_stride, x, y, w, h);
}
for (; x < w1; ++x) {
const int off1 = x - VP8_SSIM_KERNEL + (y - VP8_SSIM_KERNEL) * src_stride;
const int off2 = x - VP8_SSIM_KERNEL + (y - VP8_SSIM_KERNEL) * ref_stride;
sum += VP8SSIMGet(src + off1, src_stride, ref + off2, ref_stride);
}
for (; x < w; ++x) {
sum += VP8SSIMGetClipped(src, src_stride, ref, ref_stride, x, y, w, h);
}
}
for (; y < h; ++y) {
for (x = 0; x < w; ++x) {
sum += VP8SSIMGetClipped(src, src_stride, ref, ref_stride, x, y, w, h);
}
}
return sum;
}
//------------------------------------------------------------------------------
// Distortion
// Max value returned in case of exact similarity.
static const double kMinDistortion_dB = 99.;
static double GetPSNR(double v, double size) {
return (v > 0. && size > 0.) ? -4.3429448 * log(v / (size * 255 * 255.))
: kMinDistortion_dB;
}
static double GetLogSSIM(double v, double size) {
v = (size > 0.) ? v / size : 1.;
return (v < 1.) ? -10.0 * log10(1. - v) : kMinDistortion_dB;
}
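// Note: -4.3429448 is -10 / ln(10), so GetPSNR() is algebraically
// 10 * log10(size * 255^2 / v), the usual PSNR definition; the natural-log
// form merely avoids a log10() call.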
int WebPPlaneDistortion(const uint8_t* src, size_t src_stride,
const uint8_t* ref, size_t ref_stride,
int width, int height, size_t x_step,
int type, float* distortion, float* result) {
uint8_t* allocated = NULL;
const AccumulateFunc metric = (type == 0) ? AccumulateSSE :
(type == 1) ? AccumulateSSIM :
AccumulateLSIM;
if (src == NULL || ref == NULL ||
src_stride < x_step * width || ref_stride < x_step * width ||
result == NULL || distortion == NULL) {
return 0;
}
VP8SSIMDspInit();
if (x_step != 1) { // extract a packed plane if needed
int x, y;
uint8_t* tmp1;
uint8_t* tmp2;
allocated =
(uint8_t*)WebPSafeMalloc(2ULL * width * height, sizeof(*allocated));
if (allocated == NULL) return 0;
tmp1 = allocated;
tmp2 = tmp1 + (size_t)width * height;
for (y = 0; y < height; ++y) {
for (x = 0; x < width; ++x) {
tmp1[x + y * width] = src[x * x_step + y * src_stride];
tmp2[x + y * width] = ref[x * x_step + y * ref_stride];
}
}
src = tmp1;
ref = tmp2;
}
*distortion = (float)metric(src, width, ref, width, width, height);
WebPSafeFree(allocated);
*result = (type == 1) ? (float)GetLogSSIM(*distortion, (double)width * height)
: (float)GetPSNR(*distortion, (double)width * height);
return 1;
}
#ifdef WORDS_BIGENDIAN
#define BLUE_OFFSET 3 // uint32_t 0x000000ff is 0x00,00,00,ff in memory
#else
#define BLUE_OFFSET 0 // uint32_t 0x000000ff is 0xff,00,00,00 in memory
#endif
int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref,
int type, float results[5]) {
int w, h, c;
int ok = 0;
WebPPicture p0, p1;
double total_size = 0., total_distortion = 0.;
if (src == NULL || ref == NULL ||
src->width != ref->width || src->height != ref->height ||
results == NULL) {
return 0;
}
VP8SSIMDspInit();
if (!WebPPictureInit(&p0) || !WebPPictureInit(&p1)) return 0;
w = src->width;
h = src->height;
if (!WebPPictureView(src, 0, 0, w, h, &p0)) goto Error;
if (!WebPPictureView(ref, 0, 0, w, h, &p1)) goto Error;
// We always measure distortion in ARGB space.
if (p0.use_argb == 0 && !WebPPictureYUVAToARGB(&p0)) goto Error;
if (p1.use_argb == 0 && !WebPPictureYUVAToARGB(&p1)) goto Error;
for (c = 0; c < 4; ++c) {
float distortion;
const size_t stride0 = 4 * (size_t)p0.argb_stride;
const size_t stride1 = 4 * (size_t)p1.argb_stride;
// results are reported as BGRA
const int offset = c ^ BLUE_OFFSET;
if (!WebPPlaneDistortion((const uint8_t*)p0.argb + offset, stride0,
(const uint8_t*)p1.argb + offset, stride1,
w, h, 4, type, &distortion, results + c)) {
goto Error;
}
total_distortion += distortion;
total_size += w * h;
}
results[4] = (type == 1) ? (float)GetLogSSIM(total_distortion, total_size)
: (float)GetPSNR(total_distortion, total_size);
ok = 1;
Error:
WebPPictureFree(&p0);
WebPPictureFree(&p1);
return ok;
}
#undef BLUE_OFFSET
#else // defined(WEBP_DISABLE_STATS)
int WebPPlaneDistortion(const uint8_t* src, size_t src_stride,
const uint8_t* ref, size_t ref_stride,
int width, int height, size_t x_step,
int type, float* distortion, float* result) {
(void)src;
(void)src_stride;
(void)ref;
(void)ref_stride;
(void)width;
(void)height;
(void)x_step;
(void)type;
if (distortion == NULL || result == NULL) return 0;
*distortion = 0.f;
*result = 0.f;
return 1;
}
int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref,
int type, float results[5]) {
int i;
(void)src;
(void)ref;
(void)type;
if (results == NULL) return 0;
for (i = 0; i < 5; ++i) results[i] = 0.f;
return 1;
}
#endif // !defined(WEBP_DISABLE_STATS)
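#if 0  // Illustrative usage sketch only (not part of the library). It assumes
// two initialized WebPPicture objects of identical dimensions. Per the code
// above, results[0..3] are per-channel values in B/G/R/A order and results[4]
// is the overall value (in dB for both PSNR and SSIM).
static int ExamplePictureSSIM(const WebPPicture* a, const WebPPicture* b,
                              float* overall_ssim_db) {
  float results[5];
  if (!WebPPictureDistortion(a, b, 1 /* 1 = SSIM, 0 = PSNR */, results)) {
    return 0;
  }
  *overall_ssim_db = results[4];
  return 1;
}
#endif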


@ -0,0 +1,316 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// WebPPicture tools: copy, crop, rescaling and view.
//
// Author: Skal (pascal.massimino@gmail.com)
#include "src/webp/encode.h"
#if !defined(WEBP_REDUCE_SIZE)
#include <assert.h>
#include <stdlib.h>
#include "src/enc/vp8i_enc.h"
#include "src/utils/rescaler_utils.h"
#include "src/utils/utils.h"
#define HALVE(x) (((x) + 1) >> 1)
// Grab the 'specs' (writer, *opaque, width, height...) from 'src' and copy them
// into 'dst'. Mark 'dst' as not owning any memory.
static void PictureGrabSpecs(const WebPPicture* const src,
WebPPicture* const dst) {
assert(src != NULL && dst != NULL);
*dst = *src;
WebPPictureResetBuffers(dst);
}
//------------------------------------------------------------------------------
// Adjust top-left corner to chroma sample position.
static void SnapTopLeftPosition(const WebPPicture* const pic,
int* const left, int* const top) {
if (!pic->use_argb) {
*left &= ~1;
*top &= ~1;
}
}
// Adjust top-left corner and verify that the sub-rectangle is valid.
static int AdjustAndCheckRectangle(const WebPPicture* const pic,
int* const left, int* const top,
int width, int height) {
SnapTopLeftPosition(pic, left, top);
if ((*left) < 0 || (*top) < 0) return 0;
if (width <= 0 || height <= 0) return 0;
if ((*left) + width > pic->width) return 0;
if ((*top) + height > pic->height) return 0;
return 1;
}
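// Example: for a YUV picture, a requested top-left corner of (3, 5) snaps to
// (2, 4) so that it stays aligned with the 2x2 chroma subsampling grid; an
// ARGB picture keeps the corner unchanged.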
int WebPPictureCopy(const WebPPicture* src, WebPPicture* dst) {
if (src == NULL || dst == NULL) return 0;
if (src == dst) return 1;
PictureGrabSpecs(src, dst);
if (!WebPPictureAlloc(dst)) return 0;
if (!src->use_argb) {
WebPCopyPlane(src->y, src->y_stride,
dst->y, dst->y_stride, dst->width, dst->height);
WebPCopyPlane(src->u, src->uv_stride, dst->u, dst->uv_stride,
HALVE(dst->width), HALVE(dst->height));
WebPCopyPlane(src->v, src->uv_stride, dst->v, dst->uv_stride,
HALVE(dst->width), HALVE(dst->height));
if (dst->a != NULL) {
WebPCopyPlane(src->a, src->a_stride,
dst->a, dst->a_stride, dst->width, dst->height);
}
} else {
WebPCopyPlane((const uint8_t*)src->argb, 4 * src->argb_stride,
(uint8_t*)dst->argb, 4 * dst->argb_stride,
4 * dst->width, dst->height);
}
return 1;
}
int WebPPictureIsView(const WebPPicture* picture) {
if (picture == NULL) return 0;
if (picture->use_argb) {
return (picture->memory_argb_ == NULL);
}
return (picture->memory_ == NULL);
}
int WebPPictureView(const WebPPicture* src,
int left, int top, int width, int height,
WebPPicture* dst) {
if (src == NULL || dst == NULL) return 0;
// verify rectangle position.
if (!AdjustAndCheckRectangle(src, &left, &top, width, height)) return 0;
if (src != dst) { // beware of aliasing! We don't want to leak 'memory_'.
PictureGrabSpecs(src, dst);
}
dst->width = width;
dst->height = height;
if (!src->use_argb) {
dst->y = src->y + top * src->y_stride + left;
dst->u = src->u + (top >> 1) * src->uv_stride + (left >> 1);
dst->v = src->v + (top >> 1) * src->uv_stride + (left >> 1);
dst->y_stride = src->y_stride;
dst->uv_stride = src->uv_stride;
if (src->a != NULL) {
dst->a = src->a + top * src->a_stride + left;
dst->a_stride = src->a_stride;
}
} else {
dst->argb = src->argb + top * src->argb_stride + left;
dst->argb_stride = src->argb_stride;
}
return 1;
}
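#if 0  // Illustrative usage sketch only (not part of the library): taking a
// 100x100 view into 'pic' at offset (16, 16). A view shares the pixels of its
// parent, so freeing it releases no memory (cf. WebPPictureIsView() above).
static int ExampleView(const WebPPicture* const pic) {
  WebPPicture view;
  if (!WebPPictureInit(&view)) return 0;
  if (!WebPPictureView(pic, 16, 16, 100, 100, &view)) return 0;
  // ... use 'view' like a regular picture ...
  WebPPictureFree(&view);  // harmless: a view owns none of the memory
  return 1;
}
#endif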
//------------------------------------------------------------------------------
// Picture cropping
int WebPPictureCrop(WebPPicture* pic,
int left, int top, int width, int height) {
WebPPicture tmp;
if (pic == NULL) return 0;
if (!AdjustAndCheckRectangle(pic, &left, &top, width, height)) return 0;
PictureGrabSpecs(pic, &tmp);
tmp.width = width;
tmp.height = height;
if (!WebPPictureAlloc(&tmp)) return 0;
if (!pic->use_argb) {
const int y_offset = top * pic->y_stride + left;
const int uv_offset = (top / 2) * pic->uv_stride + left / 2;
WebPCopyPlane(pic->y + y_offset, pic->y_stride,
tmp.y, tmp.y_stride, width, height);
WebPCopyPlane(pic->u + uv_offset, pic->uv_stride,
tmp.u, tmp.uv_stride, HALVE(width), HALVE(height));
WebPCopyPlane(pic->v + uv_offset, pic->uv_stride,
tmp.v, tmp.uv_stride, HALVE(width), HALVE(height));
if (tmp.a != NULL) {
const int a_offset = top * pic->a_stride + left;
WebPCopyPlane(pic->a + a_offset, pic->a_stride,
tmp.a, tmp.a_stride, width, height);
}
} else {
const uint8_t* const src =
(const uint8_t*)(pic->argb + top * pic->argb_stride + left);
WebPCopyPlane(src, pic->argb_stride * 4, (uint8_t*)tmp.argb,
tmp.argb_stride * 4, width * 4, height);
}
WebPPictureFree(pic);
*pic = tmp;
return 1;
}
//------------------------------------------------------------------------------
// Simple picture rescaler
static int RescalePlane(const uint8_t* src,
int src_width, int src_height, int src_stride,
uint8_t* dst,
int dst_width, int dst_height, int dst_stride,
rescaler_t* const work,
int num_channels) {
WebPRescaler rescaler;
int y = 0;
if (!WebPRescalerInit(&rescaler, src_width, src_height,
dst, dst_width, dst_height, dst_stride,
num_channels, work)) {
return 0;
}
while (y < src_height) {
y += WebPRescalerImport(&rescaler, src_height - y,
src + y * src_stride, src_stride);
WebPRescalerExport(&rescaler);
}
return 1;
}
static void AlphaMultiplyARGB(WebPPicture* const pic, int inverse) {
assert(pic->argb != NULL);
WebPMultARGBRows((uint8_t*)pic->argb, pic->argb_stride * sizeof(*pic->argb),
pic->width, pic->height, inverse);
}
static void AlphaMultiplyY(WebPPicture* const pic, int inverse) {
if (pic->a != NULL) {
WebPMultRows(pic->y, pic->y_stride, pic->a, pic->a_stride,
pic->width, pic->height, inverse);
}
}
int WebPPictureRescale(WebPPicture* pic, int width, int height) {
WebPPicture tmp;
int prev_width, prev_height;
rescaler_t* work;
if (pic == NULL) return 0;
prev_width = pic->width;
prev_height = pic->height;
if (!WebPRescalerGetScaledDimensions(
prev_width, prev_height, &width, &height)) {
return 0;
}
PictureGrabSpecs(pic, &tmp);
tmp.width = width;
tmp.height = height;
if (!WebPPictureAlloc(&tmp)) return 0;
if (!pic->use_argb) {
work = (rescaler_t*)WebPSafeMalloc(2ULL * width, sizeof(*work));
if (work == NULL) {
WebPPictureFree(&tmp);
return 0;
}
// If present, we need to rescale alpha first (for AlphaMultiplyY).
if (pic->a != NULL) {
WebPInitAlphaProcessing();
if (!RescalePlane(pic->a, prev_width, prev_height, pic->a_stride,
tmp.a, width, height, tmp.a_stride, work, 1)) {
WebPPictureFree(&tmp);  // avoid leaking 'tmp' and 'work' on failure
WebPSafeFree(work);
return 0;
}
}
// We take transparency into account on the luma plane only. This is not
// exact blending, but it is a good approximation.
AlphaMultiplyY(pic, 0);
if (!RescalePlane(pic->y, prev_width, prev_height, pic->y_stride,
tmp.y, width, height, tmp.y_stride, work, 1) ||
!RescalePlane(pic->u,
HALVE(prev_width), HALVE(prev_height), pic->uv_stride,
tmp.u,
HALVE(width), HALVE(height), tmp.uv_stride, work, 1) ||
!RescalePlane(pic->v,
HALVE(prev_width), HALVE(prev_height), pic->uv_stride,
tmp.v,
HALVE(width), HALVE(height), tmp.uv_stride, work, 1)) {
WebPPictureFree(&tmp);  // avoid leaking 'tmp' and 'work' on failure
WebPSafeFree(work);
return 0;
}
AlphaMultiplyY(&tmp, 1);
} else {
work = (rescaler_t*)WebPSafeMalloc(2ULL * width * 4, sizeof(*work));
if (work == NULL) {
WebPPictureFree(&tmp);
return 0;
}
// In order to correctly interpolate colors, we need to apply the alpha
// weighting first (black-matting), scale the RGB values, and remove
// the premultiplication afterward (while preserving the alpha channel).
WebPInitAlphaProcessing();
AlphaMultiplyARGB(pic, 0);
if (!RescalePlane((const uint8_t*)pic->argb, prev_width, prev_height,
pic->argb_stride * 4,
(uint8_t*)tmp.argb, width, height,
tmp.argb_stride * 4, work, 4)) {
WebPPictureFree(&tmp);  // avoid leaking 'tmp' and 'work' on failure
WebPSafeFree(work);
return 0;
}
AlphaMultiplyARGB(&tmp, 1);
}
WebPPictureFree(pic);
WebPSafeFree(work);
*pic = tmp;
return 1;
}
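// Note: per the public API documentation, passing 0 for either 'width' or
// 'height' (but not both) lets WebPRescalerGetScaledDimensions() derive the
// missing dimension from the other one, preserving the aspect ratio.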
#else // defined(WEBP_REDUCE_SIZE)
int WebPPictureCopy(const WebPPicture* src, WebPPicture* dst) {
(void)src;
(void)dst;
return 0;
}
int WebPPictureIsView(const WebPPicture* picture) {
(void)picture;
return 0;
}
int WebPPictureView(const WebPPicture* src,
int left, int top, int width, int height,
WebPPicture* dst) {
(void)src;
(void)left;
(void)top;
(void)width;
(void)height;
(void)dst;
return 0;
}
int WebPPictureCrop(WebPPicture* pic,
int left, int top, int width, int height) {
(void)pic;
(void)left;
(void)top;
(void)width;
(void)height;
return 0;
}
int WebPPictureRescale(WebPPicture* pic, int width, int height) {
(void)pic;
(void)width;
(void)height;
return 0;
}
#endif // !defined(WEBP_REDUCE_SIZE)


@ -0,0 +1,273 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// WebPPicture tools: alpha handling, etc.
//
// Author: Skal (pascal.massimino@gmail.com)
#include <assert.h>
#include "src/enc/vp8i_enc.h"
#include "src/dsp/yuv.h"
//------------------------------------------------------------------------------
// Helper: clean up fully transparent area to help compressibility.
#define SIZE 8
#define SIZE2 (SIZE / 2)
static int IsTransparentARGBArea(const uint32_t* ptr, int stride, int size) {
int y, x;
for (y = 0; y < size; ++y) {
for (x = 0; x < size; ++x) {
if (ptr[x] & 0xff000000u) {
return 0;
}
}
ptr += stride;
}
return 1;
}
static void Flatten(uint8_t* ptr, int v, int stride, int size) {
int y;
for (y = 0; y < size; ++y) {
memset(ptr, v, size);
ptr += stride;
}
}
static void FlattenARGB(uint32_t* ptr, uint32_t v, int stride, int size) {
int x, y;
for (y = 0; y < size; ++y) {
for (x = 0; x < size; ++x) ptr[x] = v;
ptr += stride;
}
}
// Smoothen the luma components of transparent pixels. Return true if the whole
// block is transparent.
static int SmoothenBlock(const uint8_t* a_ptr, int a_stride, uint8_t* y_ptr,
int y_stride, int width, int height) {
int sum = 0, count = 0;
int x, y;
const uint8_t* alpha_ptr = a_ptr;
uint8_t* luma_ptr = y_ptr;
for (y = 0; y < height; ++y) {
for (x = 0; x < width; ++x) {
if (alpha_ptr[x] != 0) {
++count;
sum += luma_ptr[x];
}
}
alpha_ptr += a_stride;
luma_ptr += y_stride;
}
if (count > 0 && count < width * height) {
const uint8_t avg_u8 = (uint8_t)(sum / count);
alpha_ptr = a_ptr;
luma_ptr = y_ptr;
for (y = 0; y < height; ++y) {
for (x = 0; x < width; ++x) {
if (alpha_ptr[x] == 0) luma_ptr[x] = avg_u8;
}
alpha_ptr += a_stride;
luma_ptr += y_stride;
}
}
return (count == 0);
}
void WebPReplaceTransparentPixels(WebPPicture* const pic, uint32_t color) {
if (pic != NULL && pic->use_argb) {
int y = pic->height;
uint32_t* argb = pic->argb;
color &= 0xffffffu; // force alpha=0
WebPInitAlphaProcessing();
while (y-- > 0) {
WebPAlphaReplace(argb, pic->width, color);
argb += pic->argb_stride;
}
}
}
void WebPCleanupTransparentArea(WebPPicture* pic) {
int x, y, w, h;
if (pic == NULL) return;
w = pic->width / SIZE;
h = pic->height / SIZE;
// note: we ignore the left-overs on right/bottom, except for SmoothenBlock().
if (pic->use_argb) {
uint32_t argb_value = 0;
for (y = 0; y < h; ++y) {
int need_reset = 1;
for (x = 0; x < w; ++x) {
const int off = (y * pic->argb_stride + x) * SIZE;
if (IsTransparentARGBArea(pic->argb + off, pic->argb_stride, SIZE)) {
if (need_reset) {
argb_value = pic->argb[off];
need_reset = 0;
}
FlattenARGB(pic->argb + off, argb_value, pic->argb_stride, SIZE);
} else {
need_reset = 1;
}
}
}
} else {
const int width = pic->width;
const int height = pic->height;
const int y_stride = pic->y_stride;
const int uv_stride = pic->uv_stride;
const int a_stride = pic->a_stride;
uint8_t* y_ptr = pic->y;
uint8_t* u_ptr = pic->u;
uint8_t* v_ptr = pic->v;
const uint8_t* a_ptr = pic->a;
int values[3] = { 0 };
if (a_ptr == NULL || y_ptr == NULL || u_ptr == NULL || v_ptr == NULL) {
return;
}
for (y = 0; y + SIZE <= height; y += SIZE) {
int need_reset = 1;
for (x = 0; x + SIZE <= width; x += SIZE) {
if (SmoothenBlock(a_ptr + x, a_stride, y_ptr + x, y_stride,
SIZE, SIZE)) {
if (need_reset) {
values[0] = y_ptr[x];
values[1] = u_ptr[x >> 1];
values[2] = v_ptr[x >> 1];
need_reset = 0;
}
Flatten(y_ptr + x, values[0], y_stride, SIZE);
Flatten(u_ptr + (x >> 1), values[1], uv_stride, SIZE2);
Flatten(v_ptr + (x >> 1), values[2], uv_stride, SIZE2);
} else {
need_reset = 1;
}
}
if (x < width) {
SmoothenBlock(a_ptr + x, a_stride, y_ptr + x, y_stride,
width - x, SIZE);
}
a_ptr += SIZE * a_stride;
y_ptr += SIZE * y_stride;
u_ptr += SIZE2 * uv_stride;
v_ptr += SIZE2 * uv_stride;
}
if (y < height) {
const int sub_height = height - y;
for (x = 0; x + SIZE <= width; x += SIZE) {
SmoothenBlock(a_ptr + x, a_stride, y_ptr + x, y_stride,
SIZE, sub_height);
}
if (x < width) {
SmoothenBlock(a_ptr + x, a_stride, y_ptr + x, y_stride,
width - x, sub_height);
}
}
}
}
#undef SIZE
#undef SIZE2
//------------------------------------------------------------------------------
// Blend color and remove transparency info
#define BLEND(V0, V1, ALPHA) \
((((V0) * (255 - (ALPHA)) + (V1) * (ALPHA)) * 0x101 + 256) >> 16)
#define BLEND_10BIT(V0, V1, ALPHA) \
((((V0) * (1020 - (ALPHA)) + (V1) * (ALPHA)) * 0x101 + 1024) >> 18)
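// Both macros replace a division by 255: for 0 <= x <= 255 * 255,
// ((x) * 0x101 + 256) >> 16 approximates x / 255 and is exact whenever x is a
// multiple of 255. BLEND() applies this to an 8-bit alpha; BLEND_10BIT() does
// the same with an alpha summed over four pixels (hence the 0..1020 range and
// the wider rounding term).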
static WEBP_INLINE uint32_t MakeARGB32(int r, int g, int b) {
return (0xff000000u | (r << 16) | (g << 8) | b);
}
void WebPBlendAlpha(WebPPicture* pic, uint32_t background_rgb) {
const int red = (background_rgb >> 16) & 0xff;
const int green = (background_rgb >> 8) & 0xff;
const int blue = (background_rgb >> 0) & 0xff;
int x, y;
if (pic == NULL) return;
if (!pic->use_argb) {
const int uv_width = (pic->width >> 1); // omit last pixel during u/v loop
const int Y0 = VP8RGBToY(red, green, blue, YUV_HALF);
// VP8RGBToU/V expects the u/v values summed over four pixels
const int U0 = VP8RGBToU(4 * red, 4 * green, 4 * blue, 4 * YUV_HALF);
const int V0 = VP8RGBToV(4 * red, 4 * green, 4 * blue, 4 * YUV_HALF);
const int has_alpha = pic->colorspace & WEBP_CSP_ALPHA_BIT;
uint8_t* y_ptr = pic->y;
uint8_t* u_ptr = pic->u;
uint8_t* v_ptr = pic->v;
uint8_t* a_ptr = pic->a;
if (!has_alpha || a_ptr == NULL) return; // nothing to do
for (y = 0; y < pic->height; ++y) {
// Luma blending
for (x = 0; x < pic->width; ++x) {
const uint8_t alpha = a_ptr[x];
if (alpha < 0xff) {
y_ptr[x] = BLEND(Y0, y_ptr[x], alpha);
}
}
// Chroma blending every even line
if ((y & 1) == 0) {
uint8_t* const a_ptr2 =
(y + 1 == pic->height) ? a_ptr : a_ptr + pic->a_stride;
for (x = 0; x < uv_width; ++x) {
// Average four alpha values into a single blending weight.
// TODO(skal): might lead to visible contouring. Can we do better?
const uint32_t alpha =
a_ptr[2 * x + 0] + a_ptr[2 * x + 1] +
a_ptr2[2 * x + 0] + a_ptr2[2 * x + 1];
u_ptr[x] = BLEND_10BIT(U0, u_ptr[x], alpha);
v_ptr[x] = BLEND_10BIT(V0, v_ptr[x], alpha);
}
if (pic->width & 1) { // rightmost pixel
const uint32_t alpha = 2 * (a_ptr[2 * x + 0] + a_ptr2[2 * x + 0]);
u_ptr[x] = BLEND_10BIT(U0, u_ptr[x], alpha);
v_ptr[x] = BLEND_10BIT(V0, v_ptr[x], alpha);
}
} else {
u_ptr += pic->uv_stride;
v_ptr += pic->uv_stride;
}
memset(a_ptr, 0xff, pic->width); // reset alpha value to opaque
a_ptr += pic->a_stride;
y_ptr += pic->y_stride;
}
} else {
uint32_t* argb = pic->argb;
const uint32_t background = MakeARGB32(red, green, blue);
for (y = 0; y < pic->height; ++y) {
for (x = 0; x < pic->width; ++x) {
const int alpha = (argb[x] >> 24) & 0xff;
if (alpha != 0xff) {
if (alpha > 0) {
int r = (argb[x] >> 16) & 0xff;
int g = (argb[x] >> 8) & 0xff;
int b = (argb[x] >> 0) & 0xff;
r = BLEND(red, r, alpha);
g = BLEND(green, g, alpha);
b = BLEND(blue, b, alpha);
argb[x] = MakeARGB32(r, g, b);
} else {
argb[x] = background;
}
}
}
argb += pic->argb_stride;
}
}
}
#undef BLEND
#undef BLEND_10BIT
//------------------------------------------------------------------------------


@ -0,0 +1,772 @@
// Copyright 2016 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Image transform methods for lossless encoder.
//
// Authors: Vikas Arora (vikaas.arora@gmail.com)
// Jyrki Alakuijala (jyrki@google.com)
// Urvang Joshi (urvang@google.com)
// Vincent Rabaud (vrabaud@google.com)
#include "src/dsp/lossless.h"
#include "src/dsp/lossless_common.h"
#include "src/enc/vp8li_enc.h"
#define MAX_DIFF_COST (1e30f)
static const float kSpatialPredictorBias = 15.f;
static const int kPredLowEffort = 11;
static const uint32_t kMaskAlpha = 0xff000000;
// Used mostly to reduce code size and improve readability.
static WEBP_INLINE int GetMin(int a, int b) { return (a > b) ? b : a; }
//------------------------------------------------------------------------------
// Methods to calculate Entropy (Shannon).
static float PredictionCostSpatial(const int counts[256], int weight_0,
double exp_val) {
const int significant_symbols = 256 >> 4;
const double exp_decay_factor = 0.6;
double bits = weight_0 * counts[0];
int i;
for (i = 1; i < significant_symbols; ++i) {
bits += exp_val * (counts[i] + counts[256 - i]);
exp_val *= exp_decay_factor;
}
return (float)(-0.1 * bits);
}
static float PredictionCostSpatialHistogram(const int accumulated[4][256],
const int tile[4][256]) {
int i;
double retval = 0;
for (i = 0; i < 4; ++i) {
const double kExpValue = 0.94;
retval += PredictionCostSpatial(tile[i], 1, kExpValue);
retval += VP8LCombinedShannonEntropy(tile[i], accumulated[i]);
}
return (float)retval;
}
static WEBP_INLINE void UpdateHisto(int histo_argb[4][256], uint32_t argb) {
++histo_argb[0][argb >> 24];
++histo_argb[1][(argb >> 16) & 0xff];
++histo_argb[2][(argb >> 8) & 0xff];
++histo_argb[3][argb & 0xff];
}
//------------------------------------------------------------------------------
// Spatial transform functions.
static WEBP_INLINE void PredictBatch(int mode, int x_start, int y,
int num_pixels, const uint32_t* current,
const uint32_t* upper, uint32_t* out) {
if (x_start == 0) {
if (y == 0) {
// ARGB_BLACK.
VP8LPredictorsSub[0](current, NULL, 1, out);
} else {
// Top one.
VP8LPredictorsSub[2](current, upper, 1, out);
}
++x_start;
++out;
--num_pixels;
}
if (y == 0) {
// Left one.
VP8LPredictorsSub[1](current + x_start, NULL, num_pixels, out);
} else {
VP8LPredictorsSub[mode](current + x_start, upper + x_start, num_pixels,
out);
}
}
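// Reminder of the conventions used above: predictor 0 is the constant
// ARGB_BLACK, predictor 1 the left pixel and predictor 2 the top pixel, which
// is why the first row/column are special-cased; all kNumPredModes modes are
// evaluated per tile in GetBestPredictorForTile() below.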
#if (WEBP_NEAR_LOSSLESS == 1)
static WEBP_INLINE int GetMax(int a, int b) { return (a < b) ? b : a; }
static int MaxDiffBetweenPixels(uint32_t p1, uint32_t p2) {
const int diff_a = abs((int)(p1 >> 24) - (int)(p2 >> 24));
const int diff_r = abs((int)((p1 >> 16) & 0xff) - (int)((p2 >> 16) & 0xff));
const int diff_g = abs((int)((p1 >> 8) & 0xff) - (int)((p2 >> 8) & 0xff));
const int diff_b = abs((int)(p1 & 0xff) - (int)(p2 & 0xff));
return GetMax(GetMax(diff_a, diff_r), GetMax(diff_g, diff_b));
}
static int MaxDiffAroundPixel(uint32_t current, uint32_t up, uint32_t down,
uint32_t left, uint32_t right) {
const int diff_up = MaxDiffBetweenPixels(current, up);
const int diff_down = MaxDiffBetweenPixels(current, down);
const int diff_left = MaxDiffBetweenPixels(current, left);
const int diff_right = MaxDiffBetweenPixels(current, right);
return GetMax(GetMax(diff_up, diff_down), GetMax(diff_left, diff_right));
}
static uint32_t AddGreenToBlueAndRed(uint32_t argb) {
const uint32_t green = (argb >> 8) & 0xff;
uint32_t red_blue = argb & 0x00ff00ffu;
red_blue += (green << 16) | green;
red_blue &= 0x00ff00ffu;
return (argb & 0xff00ff00u) | red_blue;
}
static void MaxDiffsForRow(int width, int stride, const uint32_t* const argb,
uint8_t* const max_diffs, int used_subtract_green) {
uint32_t current, up, down, left, right;
int x;
if (width <= 2) return;
current = argb[0];
right = argb[1];
if (used_subtract_green) {
current = AddGreenToBlueAndRed(current);
right = AddGreenToBlueAndRed(right);
}
// max_diffs[0] and max_diffs[width - 1] are never used.
for (x = 1; x < width - 1; ++x) {
up = argb[-stride + x];
down = argb[stride + x];
left = current;
current = right;
right = argb[x + 1];
if (used_subtract_green) {
up = AddGreenToBlueAndRed(up);
down = AddGreenToBlueAndRed(down);
right = AddGreenToBlueAndRed(right);
}
max_diffs[x] = MaxDiffAroundPixel(current, up, down, left, right);
}
}
// Quantize the difference between the actual component value and its prediction
// to a multiple of quantization, working modulo 256, taking care not to cross
// a boundary (inclusive upper limit).
static uint8_t NearLosslessComponent(uint8_t value, uint8_t predict,
uint8_t boundary, int quantization) {
const int residual = (value - predict) & 0xff;
const int boundary_residual = (boundary - predict) & 0xff;
const int lower = residual & ~(quantization - 1);
const int upper = lower + quantization;
// Resolve ties towards a value closer to the prediction (i.e. towards lower
// if value comes after prediction and towards upper otherwise).
const int bias = ((boundary - value) & 0xff) < boundary_residual;
if (residual - lower < upper - residual + bias) {
// lower is closer to residual than upper.
if (residual > boundary_residual && lower <= boundary_residual) {
// Halve quantization step to avoid crossing boundary. This midpoint is
// on the same side of boundary as residual because midpoint >= residual
// (since lower is closer than upper) and residual is above the boundary.
return lower + (quantization >> 1);
}
return lower;
} else {
// upper is closer to residual than lower.
if (residual <= boundary_residual && upper > boundary_residual) {
// Halve quantization step to avoid crossing boundary. This midpoint is
// on the same side of boundary as residual because midpoint <= residual
// (since upper is closer than lower) and residual is below the boundary.
return lower + (quantization >> 1);
}
return upper & 0xff;
}
}
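// Worked example, away from the boundary: value = 123, predict = 120 and
// quantization = 4 give residual = 3, lower = 0, upper = 4; 'upper' wins, so
// the stored residual is 4 and the decoder reconstructs 124 instead of 123.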
static WEBP_INLINE uint8_t NearLosslessDiff(uint8_t a, uint8_t b) {
return (uint8_t)((((int)(a) - (int)(b))) & 0xff);
}
// Quantize every component of the difference between the actual pixel value
// and its prediction to a multiple of a quantization step (a power of two no
// larger than max_quantization, which is itself a power of two smaller than
// max_diff). Take care if value and predict have undergone subtract green,
// which means that red and blue are represented as offsets from green.
static uint32_t NearLossless(uint32_t value, uint32_t predict,
int max_quantization, int max_diff,
int used_subtract_green) {
int quantization;
uint8_t new_green = 0;
uint8_t green_diff = 0;
uint8_t a, r, g, b;
if (max_diff <= 2) {
return VP8LSubPixels(value, predict);
}
quantization = max_quantization;
while (quantization >= max_diff) {
quantization >>= 1;
}
if ((value >> 24) == 0 || (value >> 24) == 0xff) {
// Preserve transparency of fully transparent or fully opaque pixels.
a = NearLosslessDiff((value >> 24) & 0xff, (predict >> 24) & 0xff);
} else {
a = NearLosslessComponent(value >> 24, predict >> 24, 0xff, quantization);
}
g = NearLosslessComponent((value >> 8) & 0xff, (predict >> 8) & 0xff, 0xff,
quantization);
if (used_subtract_green) {
// The green offset will be added to red and blue components during decoding
// to obtain the actual red and blue values.
new_green = ((predict >> 8) + g) & 0xff;
// The amount by which green has been adjusted during quantization. It is
// subtracted from red and blue for compensation, to avoid accumulating two
// quantization errors in them.
green_diff = NearLosslessDiff(new_green, (value >> 8) & 0xff);
}
r = NearLosslessComponent(NearLosslessDiff((value >> 16) & 0xff, green_diff),
(predict >> 16) & 0xff, 0xff - new_green,
quantization);
b = NearLosslessComponent(NearLosslessDiff(value & 0xff, green_diff),
predict & 0xff, 0xff - new_green, quantization);
return ((uint32_t)a << 24) | ((uint32_t)r << 16) | ((uint32_t)g << 8) | b;
}
#endif // (WEBP_NEAR_LOSSLESS == 1)
// Stores the difference between the pixel and its prediction in "out".
// In case of a lossy encoding, updates the source image to avoid propagating
// the deviation further to pixels which depend on the current pixel for their
// predictions.
static WEBP_INLINE void GetResidual(
int width, int height, uint32_t* const upper_row,
uint32_t* const current_row, const uint8_t* const max_diffs, int mode,
int x_start, int x_end, int y, int max_quantization, int exact,
int used_subtract_green, uint32_t* const out) {
if (exact) {
PredictBatch(mode, x_start, y, x_end - x_start, current_row, upper_row,
out);
} else {
const VP8LPredictorFunc pred_func = VP8LPredictors[mode];
int x;
for (x = x_start; x < x_end; ++x) {
uint32_t predict;
uint32_t residual;
if (y == 0) {
predict = (x == 0) ? ARGB_BLACK : current_row[x - 1]; // Left.
} else if (x == 0) {
predict = upper_row[x]; // Top.
} else {
predict = pred_func(current_row[x - 1], upper_row + x);
}
#if (WEBP_NEAR_LOSSLESS == 1)
if (max_quantization == 1 || mode == 0 || y == 0 || y == height - 1 ||
x == 0 || x == width - 1) {
residual = VP8LSubPixels(current_row[x], predict);
} else {
residual = NearLossless(current_row[x], predict, max_quantization,
max_diffs[x], used_subtract_green);
// Update the source image.
current_row[x] = VP8LAddPixels(predict, residual);
// x is never 0 here so we do not need to update upper_row like below.
}
#else
(void)max_diffs;
(void)height;
(void)max_quantization;
(void)used_subtract_green;
residual = VP8LSubPixels(current_row[x], predict);
#endif
if ((current_row[x] & kMaskAlpha) == 0) {
// If alpha is 0, cleanup RGB. We can choose the RGB values of the
// residual for best compression. The prediction of alpha itself can be
// non-zero and must be kept though. We choose RGB of the residual to be
// 0.
residual &= kMaskAlpha;
// Update the source image.
current_row[x] = predict & ~kMaskAlpha;
// The prediction for the rightmost pixel in a row uses the leftmost pixel in
// that row as its top-right context pixel. Hence if we change the leftmost
// pixel of current_row, the corresponding change must also be applied to
// upper_row, where the top-right context is read from.
if (x == 0 && y != 0) upper_row[width] = current_row[0];
}
out[x - x_start] = residual;
}
}
}
// Returns best predictor and updates the accumulated histogram.
// If max_quantization > 1, assumes that near lossless processing will be
// applied, quantizing residuals to multiples of quantization levels up to
// max_quantization (the actual quantization level depends on smoothness near
// the given pixel).
static int GetBestPredictorForTile(int width, int height,
int tile_x, int tile_y, int bits,
int accumulated[4][256],
uint32_t* const argb_scratch,
const uint32_t* const argb,
int max_quantization,
int exact, int used_subtract_green,
const uint32_t* const modes) {
const int kNumPredModes = 14;
const int start_x = tile_x << bits;
const int start_y = tile_y << bits;
const int tile_size = 1 << bits;
const int max_y = GetMin(tile_size, height - start_y);
const int max_x = GetMin(tile_size, width - start_x);
// Whether there exist columns just outside the tile.
const int have_left = (start_x > 0);
// Position and size of the strip covering the tile and adjacent columns if
// they exist.
const int context_start_x = start_x - have_left;
#if (WEBP_NEAR_LOSSLESS == 1)
const int context_width = max_x + have_left + (max_x < width - start_x);
#endif
const int tiles_per_row = VP8LSubSampleSize(width, bits);
// Prediction modes of the left and above neighbor tiles.
const int left_mode = (tile_x > 0) ?
(modes[tile_y * tiles_per_row + tile_x - 1] >> 8) & 0xff : 0xff;
const int above_mode = (tile_y > 0) ?
(modes[(tile_y - 1) * tiles_per_row + tile_x] >> 8) & 0xff : 0xff;
// The width of upper_row and current_row is one pixel larger than image width
// to allow the top right pixel to point to the leftmost pixel of the next row
// when at the right edge.
uint32_t* upper_row = argb_scratch;
uint32_t* current_row = upper_row + width + 1;
uint8_t* const max_diffs = (uint8_t*)(current_row + width + 1);
float best_diff = MAX_DIFF_COST;
int best_mode = 0;
int mode;
int histo_stack_1[4][256];
int histo_stack_2[4][256];
// Need pointers to be able to swap arrays.
int (*histo_argb)[256] = histo_stack_1;
int (*best_histo)[256] = histo_stack_2;
int i, j;
uint32_t residuals[1 << MAX_TRANSFORM_BITS];
assert(bits <= MAX_TRANSFORM_BITS);
assert(max_x <= (1 << MAX_TRANSFORM_BITS));
for (mode = 0; mode < kNumPredModes; ++mode) {
float cur_diff;
int relative_y;
memset(histo_argb, 0, sizeof(histo_stack_1));
if (start_y > 0) {
// Read the row above the tile which will become the first upper_row.
// Include a pixel to the left if it exists; include a pixel to the right
// in all cases (wrapping to the leftmost pixel of the next row if it does
// not exist).
memcpy(current_row + context_start_x,
argb + (start_y - 1) * width + context_start_x,
sizeof(*argb) * (max_x + have_left + 1));
}
for (relative_y = 0; relative_y < max_y; ++relative_y) {
const int y = start_y + relative_y;
int relative_x;
uint32_t* tmp = upper_row;
upper_row = current_row;
current_row = tmp;
// Read current_row. Include a pixel to the left if it exists; include a
// pixel to the right in all cases except at the bottom right corner of
// the image (wrapping to the leftmost pixel of the next row if it does
// not exist in the current row).
memcpy(current_row + context_start_x,
argb + y * width + context_start_x,
sizeof(*argb) * (max_x + have_left + (y + 1 < height)));
#if (WEBP_NEAR_LOSSLESS == 1)
if (max_quantization > 1 && y >= 1 && y + 1 < height) {
MaxDiffsForRow(context_width, width, argb + y * width + context_start_x,
max_diffs + context_start_x, used_subtract_green);
}
#endif
GetResidual(width, height, upper_row, current_row, max_diffs, mode,
start_x, start_x + max_x, y, max_quantization, exact,
used_subtract_green, residuals);
for (relative_x = 0; relative_x < max_x; ++relative_x) {
UpdateHisto(histo_argb, residuals[relative_x]);
}
}
cur_diff = PredictionCostSpatialHistogram(
(const int (*)[256])accumulated, (const int (*)[256])histo_argb);
// Favor keeping the areas locally similar.
if (mode == left_mode) cur_diff -= kSpatialPredictorBias;
if (mode == above_mode) cur_diff -= kSpatialPredictorBias;
if (cur_diff < best_diff) {
int (*tmp)[256] = histo_argb;
histo_argb = best_histo;
best_histo = tmp;
best_diff = cur_diff;
best_mode = mode;
}
}
for (i = 0; i < 4; i++) {
for (j = 0; j < 256; j++) {
accumulated[i][j] += best_histo[i][j];
}
}
return best_mode;
}
// Converts pixels of the image to residuals with respect to predictions.
// If max_quantization > 1, applies near lossless processing, quantizing
// residuals to multiples of quantization levels up to max_quantization
// (the actual quantization level depends on smoothness near the given pixel).
static void CopyImageWithPrediction(int width, int height,
int bits, uint32_t* const modes,
uint32_t* const argb_scratch,
uint32_t* const argb,
int low_effort, int max_quantization,
int exact, int used_subtract_green) {
const int tiles_per_row = VP8LSubSampleSize(width, bits);
// The width of upper_row and current_row is one pixel larger than image width
// to allow the top right pixel to point to the leftmost pixel of the next row
// when at the right edge.
uint32_t* upper_row = argb_scratch;
uint32_t* current_row = upper_row + width + 1;
uint8_t* current_max_diffs = (uint8_t*)(current_row + width + 1);
#if (WEBP_NEAR_LOSSLESS == 1)
uint8_t* lower_max_diffs = current_max_diffs + width;
#endif
int y;
for (y = 0; y < height; ++y) {
int x;
uint32_t* const tmp32 = upper_row;
upper_row = current_row;
current_row = tmp32;
memcpy(current_row, argb + y * width,
sizeof(*argb) * (width + (y + 1 < height)));
if (low_effort) {
PredictBatch(kPredLowEffort, 0, y, width, current_row, upper_row,
argb + y * width);
} else {
#if (WEBP_NEAR_LOSSLESS == 1)
if (max_quantization > 1) {
// Compute max_diffs for the lower row now, because that needs the
// contents of argb for the current row, which we will overwrite with
// residuals before proceeding with the next row.
uint8_t* const tmp8 = current_max_diffs;
current_max_diffs = lower_max_diffs;
lower_max_diffs = tmp8;
if (y + 2 < height) {
MaxDiffsForRow(width, width, argb + (y + 1) * width, lower_max_diffs,
used_subtract_green);
}
}
#endif
for (x = 0; x < width;) {
const int mode =
(modes[(y >> bits) * tiles_per_row + (x >> bits)] >> 8) & 0xff;
int x_end = x + (1 << bits);
if (x_end > width) x_end = width;
GetResidual(width, height, upper_row, current_row, current_max_diffs,
mode, x, x_end, y, max_quantization, exact,
used_subtract_green, argb + y * width + x);
x = x_end;
}
}
}
}
// Finds the best predictor for each tile, and converts the image to residuals
// with respect to predictions. If near_lossless_quality < 100, applies
// near lossless processing, shaving off more bits of residuals for lower
// qualities.
void VP8LResidualImage(int width, int height, int bits, int low_effort,
uint32_t* const argb, uint32_t* const argb_scratch,
uint32_t* const image, int near_lossless_quality,
int exact, int used_subtract_green) {
const int tiles_per_row = VP8LSubSampleSize(width, bits);
const int tiles_per_col = VP8LSubSampleSize(height, bits);
int tile_y;
int histo[4][256];
const int max_quantization = 1 << VP8LNearLosslessBits(near_lossless_quality);
if (low_effort) {
int i;
for (i = 0; i < tiles_per_row * tiles_per_col; ++i) {
image[i] = ARGB_BLACK | (kPredLowEffort << 8);
}
} else {
memset(histo, 0, sizeof(histo));
for (tile_y = 0; tile_y < tiles_per_col; ++tile_y) {
int tile_x;
for (tile_x = 0; tile_x < tiles_per_row; ++tile_x) {
const int pred = GetBestPredictorForTile(width, height, tile_x, tile_y,
bits, histo, argb_scratch, argb, max_quantization, exact,
used_subtract_green, image);
image[tile_y * tiles_per_row + tile_x] = ARGB_BLACK | (pred << 8);
}
}
}
CopyImageWithPrediction(width, height, bits, image, argb_scratch, argb,
low_effort, max_quantization, exact,
used_subtract_green);
}
//------------------------------------------------------------------------------
// Color transform functions.
static WEBP_INLINE void MultipliersClear(VP8LMultipliers* const m) {
m->green_to_red_ = 0;
m->green_to_blue_ = 0;
m->red_to_blue_ = 0;
}
static WEBP_INLINE void ColorCodeToMultipliers(uint32_t color_code,
VP8LMultipliers* const m) {
m->green_to_red_ = (color_code >> 0) & 0xff;
m->green_to_blue_ = (color_code >> 8) & 0xff;
m->red_to_blue_ = (color_code >> 16) & 0xff;
}
static WEBP_INLINE uint32_t MultipliersToColorCode(
const VP8LMultipliers* const m) {
return 0xff000000u |
((uint32_t)(m->red_to_blue_) << 16) |
((uint32_t)(m->green_to_blue_) << 8) |
m->green_to_red_;
}
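// Example: the color code 0xff201005 decodes to green_to_red_ = 0x05,
// green_to_blue_ = 0x10 and red_to_blue_ = 0x20, and MultipliersToColorCode()
// re-encodes those multipliers to exactly 0xff201005.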
static float PredictionCostCrossColor(const int accumulated[256],
const int counts[256]) {
// Favor low entropy, locally and globally.
// Favor small absolute values for PredictionCostSpatial
static const double kExpValue = 2.4;
return VP8LCombinedShannonEntropy(counts, accumulated) +
PredictionCostSpatial(counts, 3, kExpValue);
}
static float GetPredictionCostCrossColorRed(
const uint32_t* argb, int stride, int tile_width, int tile_height,
VP8LMultipliers prev_x, VP8LMultipliers prev_y, int green_to_red,
const int accumulated_red_histo[256]) {
int histo[256] = { 0 };
float cur_diff;
VP8LCollectColorRedTransforms(argb, stride, tile_width, tile_height,
green_to_red, histo);
cur_diff = PredictionCostCrossColor(accumulated_red_histo, histo);
if ((uint8_t)green_to_red == prev_x.green_to_red_) {
cur_diff -= 3; // favor keeping the areas locally similar
}
if ((uint8_t)green_to_red == prev_y.green_to_red_) {
cur_diff -= 3; // favor keeping the areas locally similar
}
if (green_to_red == 0) {
cur_diff -= 3;
}
return cur_diff;
}
static void GetBestGreenToRed(
const uint32_t* argb, int stride, int tile_width, int tile_height,
VP8LMultipliers prev_x, VP8LMultipliers prev_y, int quality,
const int accumulated_red_histo[256], VP8LMultipliers* const best_tx) {
const int kMaxIters = 4 + ((7 * quality) >> 8); // in range [4..6]
int green_to_red_best = 0;
int iter, offset;
float best_diff = GetPredictionCostCrossColorRed(
argb, stride, tile_width, tile_height, prev_x, prev_y,
green_to_red_best, accumulated_red_histo);
for (iter = 0; iter < kMaxIters; ++iter) {
// ColorTransformDelta is a 3.5-bit fixed-point value, so 32 corresponds to
// 1.0 in the color computation. Starting with a delta of 1.0 (i.e. 32) is
// sufficient to explore the range (-2, 2).
const int delta = 32 >> iter;
// Try a negative and a positive delta from the best known value.
for (offset = -delta; offset <= delta; offset += 2 * delta) {
const int green_to_red_cur = offset + green_to_red_best;
const float cur_diff = GetPredictionCostCrossColorRed(
argb, stride, tile_width, tile_height, prev_x, prev_y,
green_to_red_cur, accumulated_red_histo);
if (cur_diff < best_diff) {
best_diff = cur_diff;
green_to_red_best = green_to_red_cur;
}
}
}
best_tx->green_to_red_ = (green_to_red_best & 0xff);
}
static float GetPredictionCostCrossColorBlue(
const uint32_t* argb, int stride, int tile_width, int tile_height,
VP8LMultipliers prev_x, VP8LMultipliers prev_y,
int green_to_blue, int red_to_blue, const int accumulated_blue_histo[256]) {
int histo[256] = { 0 };
float cur_diff;
VP8LCollectColorBlueTransforms(argb, stride, tile_width, tile_height,
green_to_blue, red_to_blue, histo);
cur_diff = PredictionCostCrossColor(accumulated_blue_histo, histo);
if ((uint8_t)green_to_blue == prev_x.green_to_blue_) {
cur_diff -= 3; // favor keeping the areas locally similar
}
if ((uint8_t)green_to_blue == prev_y.green_to_blue_) {
cur_diff -= 3; // favor keeping the areas locally similar
}
if ((uint8_t)red_to_blue == prev_x.red_to_blue_) {
cur_diff -= 3; // favor keeping the areas locally similar
}
if ((uint8_t)red_to_blue == prev_y.red_to_blue_) {
cur_diff -= 3; // favor keeping the areas locally similar
}
if (green_to_blue == 0) {
cur_diff -= 3;
}
if (red_to_blue == 0) {
cur_diff -= 3;
}
return cur_diff;
}
#define kGreenRedToBlueNumAxis 8
#define kGreenRedToBlueMaxIters 7
static void GetBestGreenRedToBlue(
const uint32_t* argb, int stride, int tile_width, int tile_height,
VP8LMultipliers prev_x, VP8LMultipliers prev_y, int quality,
const int accumulated_blue_histo[256],
VP8LMultipliers* const best_tx) {
const int8_t offset[kGreenRedToBlueNumAxis][2] =
{{0, -1}, {0, 1}, {-1, 0}, {1, 0}, {-1, -1}, {-1, 1}, {1, -1}, {1, 1}};
const int8_t delta_lut[kGreenRedToBlueMaxIters] = { 16, 16, 8, 4, 2, 2, 2 };
const int iters =
(quality < 25) ? 1 : (quality > 50) ? kGreenRedToBlueMaxIters : 4;
int green_to_blue_best = 0;
int red_to_blue_best = 0;
int iter;
// Initial value at origin:
float best_diff = GetPredictionCostCrossColorBlue(
argb, stride, tile_width, tile_height, prev_x, prev_y,
green_to_blue_best, red_to_blue_best, accumulated_blue_histo);
for (iter = 0; iter < iters; ++iter) {
const int delta = delta_lut[iter];
int axis;
for (axis = 0; axis < kGreenRedToBlueNumAxis; ++axis) {
const int green_to_blue_cur =
offset[axis][0] * delta + green_to_blue_best;
const int red_to_blue_cur = offset[axis][1] * delta + red_to_blue_best;
const float cur_diff = GetPredictionCostCrossColorBlue(
argb, stride, tile_width, tile_height, prev_x, prev_y,
green_to_blue_cur, red_to_blue_cur, accumulated_blue_histo);
if (cur_diff < best_diff) {
best_diff = cur_diff;
green_to_blue_best = green_to_blue_cur;
red_to_blue_best = red_to_blue_cur;
}
if (quality < 25 && iter == 4) {
// Only axis aligned diffs for lower quality.
break; // next iter.
}
}
if (delta == 2 && green_to_blue_best == 0 && red_to_blue_best == 0) {
// Further iterations would not help.
break; // out of iter-loop.
}
}
best_tx->green_to_blue_ = green_to_blue_best & 0xff;
best_tx->red_to_blue_ = red_to_blue_best & 0xff;
}
#undef kGreenRedToBlueMaxIters
#undef kGreenRedToBlueNumAxis
static VP8LMultipliers GetBestColorTransformForTile(
int tile_x, int tile_y, int bits,
VP8LMultipliers prev_x,
VP8LMultipliers prev_y,
int quality, int xsize, int ysize,
const int accumulated_red_histo[256],
const int accumulated_blue_histo[256],
const uint32_t* const argb) {
const int max_tile_size = 1 << bits;
const int tile_y_offset = tile_y * max_tile_size;
const int tile_x_offset = tile_x * max_tile_size;
const int all_x_max = GetMin(tile_x_offset + max_tile_size, xsize);
const int all_y_max = GetMin(tile_y_offset + max_tile_size, ysize);
const int tile_width = all_x_max - tile_x_offset;
const int tile_height = all_y_max - tile_y_offset;
const uint32_t* const tile_argb = argb + tile_y_offset * xsize
+ tile_x_offset;
VP8LMultipliers best_tx;
MultipliersClear(&best_tx);
GetBestGreenToRed(tile_argb, xsize, tile_width, tile_height,
prev_x, prev_y, quality, accumulated_red_histo, &best_tx);
GetBestGreenRedToBlue(tile_argb, xsize, tile_width, tile_height,
prev_x, prev_y, quality, accumulated_blue_histo,
&best_tx);
return best_tx;
}
static void CopyTileWithColorTransform(int xsize, int ysize,
int tile_x, int tile_y,
int max_tile_size,
VP8LMultipliers color_transform,
uint32_t* argb) {
const int xscan = GetMin(max_tile_size, xsize - tile_x);
int yscan = GetMin(max_tile_size, ysize - tile_y);
argb += tile_y * xsize + tile_x;
while (yscan-- > 0) {
VP8LTransformColor(&color_transform, argb, xscan);
argb += xsize;
}
}
void VP8LColorSpaceTransform(int width, int height, int bits, int quality,
uint32_t* const argb, uint32_t* image) {
const int max_tile_size = 1 << bits;
const int tile_xsize = VP8LSubSampleSize(width, bits);
const int tile_ysize = VP8LSubSampleSize(height, bits);
int accumulated_red_histo[256] = { 0 };
int accumulated_blue_histo[256] = { 0 };
int tile_x, tile_y;
VP8LMultipliers prev_x, prev_y;
MultipliersClear(&prev_y);
MultipliersClear(&prev_x);
for (tile_y = 0; tile_y < tile_ysize; ++tile_y) {
for (tile_x = 0; tile_x < tile_xsize; ++tile_x) {
int y;
const int tile_x_offset = tile_x * max_tile_size;
const int tile_y_offset = tile_y * max_tile_size;
const int all_x_max = GetMin(tile_x_offset + max_tile_size, width);
const int all_y_max = GetMin(tile_y_offset + max_tile_size, height);
const int offset = tile_y * tile_xsize + tile_x;
if (tile_y != 0) {
ColorCodeToMultipliers(image[offset - tile_xsize], &prev_y);
}
prev_x = GetBestColorTransformForTile(tile_x, tile_y, bits,
prev_x, prev_y,
quality, width, height,
accumulated_red_histo,
accumulated_blue_histo,
argb);
image[offset] = MultipliersToColorCode(&prev_x);
CopyTileWithColorTransform(width, height, tile_x_offset, tile_y_offset,
max_tile_size, prev_x, argb);
// Gather accumulated histogram data.
for (y = tile_y_offset; y < all_y_max; ++y) {
int ix = y * width + tile_x_offset;
const int ix_end = ix + all_x_max - tile_x_offset;
for (; ix < ix_end; ++ix) {
const uint32_t pix = argb[ix];
if (ix >= 2 &&
pix == argb[ix - 2] &&
pix == argb[ix - 1]) {
continue; // repeated pixels are handled by backward references
}
if (ix >= width + 2 &&
argb[ix - 2] == argb[ix - width - 2] &&
argb[ix - 1] == argb[ix - width - 1] &&
pix == argb[ix - width]) {
continue; // repeated pixels are handled by backward references
}
++accumulated_red_histo[(pix >> 16) & 0xff];
++accumulated_blue_histo[(pix >> 0) & 0xff];
}
}
}
}
}

File diff suppressed because it is too large.


@ -0,0 +1,388 @@
// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Header syntax writing
//
// Author: Skal (pascal.massimino@gmail.com)
#include <assert.h>
#include "src/utils/utils.h"
#include "src/webp/format_constants.h" // RIFF constants
#include "src/webp/mux_types.h" // ALPHA_FLAG
#include "src/enc/vp8i_enc.h"
//------------------------------------------------------------------------------
// Helper functions
static int IsVP8XNeeded(const VP8Encoder* const enc) {
return !!enc->has_alpha_; // Currently the only case when VP8X is needed.
// This could change in the future.
}
static int PutPaddingByte(const WebPPicture* const pic) {
const uint8_t pad_byte[1] = { 0 };
return !!pic->writer(pad_byte, 1, pic);
}
//------------------------------------------------------------------------------
// Writers for header's various pieces (in order of appearance)
static WebPEncodingError PutRIFFHeader(const VP8Encoder* const enc,
size_t riff_size) {
const WebPPicture* const pic = enc->pic_;
uint8_t riff[RIFF_HEADER_SIZE] = {
'R', 'I', 'F', 'F', 0, 0, 0, 0, 'W', 'E', 'B', 'P'
};
assert(riff_size == (uint32_t)riff_size);
PutLE32(riff + TAG_SIZE, (uint32_t)riff_size);
if (!pic->writer(riff, sizeof(riff), pic)) {
return VP8_ENC_ERROR_BAD_WRITE;
}
return VP8_ENC_OK;
}
static WebPEncodingError PutVP8XHeader(const VP8Encoder* const enc) {
const WebPPicture* const pic = enc->pic_;
uint8_t vp8x[CHUNK_HEADER_SIZE + VP8X_CHUNK_SIZE] = {
'V', 'P', '8', 'X'
};
uint32_t flags = 0;
assert(IsVP8XNeeded(enc));
assert(pic->width >= 1 && pic->height >= 1);
assert(pic->width <= MAX_CANVAS_SIZE && pic->height <= MAX_CANVAS_SIZE);
if (enc->has_alpha_) {
flags |= ALPHA_FLAG;
}
PutLE32(vp8x + TAG_SIZE, VP8X_CHUNK_SIZE);
PutLE32(vp8x + CHUNK_HEADER_SIZE, flags);
PutLE24(vp8x + CHUNK_HEADER_SIZE + 4, pic->width - 1);
PutLE24(vp8x + CHUNK_HEADER_SIZE + 7, pic->height - 1);
if (!pic->writer(vp8x, sizeof(vp8x), pic)) {
return VP8_ENC_ERROR_BAD_WRITE;
}
return VP8_ENC_OK;
}
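// The emitted chunk is therefore: 'V','P','8','X', the LE32 payload size
// (VP8X_CHUNK_SIZE), a LE32 flags word, then canvas width - 1 and height - 1
// as LE24 values.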
static WebPEncodingError PutAlphaChunk(const VP8Encoder* const enc) {
const WebPPicture* const pic = enc->pic_;
uint8_t alpha_chunk_hdr[CHUNK_HEADER_SIZE] = {
'A', 'L', 'P', 'H'
};
assert(enc->has_alpha_);
// Alpha chunk header.
PutLE32(alpha_chunk_hdr + TAG_SIZE, enc->alpha_data_size_);
if (!pic->writer(alpha_chunk_hdr, sizeof(alpha_chunk_hdr), pic)) {
return VP8_ENC_ERROR_BAD_WRITE;
}
// Alpha chunk data.
if (!pic->writer(enc->alpha_data_, enc->alpha_data_size_, pic)) {
return VP8_ENC_ERROR_BAD_WRITE;
}
// Padding.
if ((enc->alpha_data_size_ & 1) && !PutPaddingByte(pic)) {
return VP8_ENC_ERROR_BAD_WRITE;
}
return VP8_ENC_OK;
}
static WebPEncodingError PutVP8Header(const WebPPicture* const pic,
size_t vp8_size) {
uint8_t vp8_chunk_hdr[CHUNK_HEADER_SIZE] = {
'V', 'P', '8', ' '
};
assert(vp8_size == (uint32_t)vp8_size);
PutLE32(vp8_chunk_hdr + TAG_SIZE, (uint32_t)vp8_size);
if (!pic->writer(vp8_chunk_hdr, sizeof(vp8_chunk_hdr), pic)) {
return VP8_ENC_ERROR_BAD_WRITE;
}
return VP8_ENC_OK;
}
static WebPEncodingError PutVP8FrameHeader(const WebPPicture* const pic,
int profile, size_t size0) {
uint8_t vp8_frm_hdr[VP8_FRAME_HEADER_SIZE];
uint32_t bits;
if (size0 >= VP8_MAX_PARTITION0_SIZE) { // partition #0 is too big to fit
return VP8_ENC_ERROR_PARTITION0_OVERFLOW;
}
// Paragraph 9.1.
bits = 0 // keyframe (1b)
| (profile << 1) // profile (3b)
| (1 << 4) // visible (1b)
| ((uint32_t)size0 << 5); // partition length (19b)
vp8_frm_hdr[0] = (bits >> 0) & 0xff;
vp8_frm_hdr[1] = (bits >> 8) & 0xff;
vp8_frm_hdr[2] = (bits >> 16) & 0xff;
// signature
vp8_frm_hdr[3] = (VP8_SIGNATURE >> 16) & 0xff;
vp8_frm_hdr[4] = (VP8_SIGNATURE >> 8) & 0xff;
vp8_frm_hdr[5] = (VP8_SIGNATURE >> 0) & 0xff;
// dimensions
vp8_frm_hdr[6] = pic->width & 0xff;
vp8_frm_hdr[7] = pic->width >> 8;
vp8_frm_hdr[8] = pic->height & 0xff;
vp8_frm_hdr[9] = pic->height >> 8;
if (!pic->writer(vp8_frm_hdr, sizeof(vp8_frm_hdr), pic)) {
return VP8_ENC_ERROR_BAD_WRITE;
}
return VP8_ENC_OK;
}
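// Worked example: profile = 0 and size0 = 1000 give
// bits = (1 << 4) | (1000 << 5) = 0x7d10, emitted as the little-endian bytes
// 0x10 0x7d 0x00, followed by the three VP8_SIGNATURE bytes and the 16-bit
// little-endian width and height.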
// WebP Headers.
static int PutWebPHeaders(const VP8Encoder* const enc, size_t size0,
size_t vp8_size, size_t riff_size) {
WebPPicture* const pic = enc->pic_;
WebPEncodingError err = VP8_ENC_OK;
// RIFF header.
err = PutRIFFHeader(enc, riff_size);
if (err != VP8_ENC_OK) goto Error;
// VP8X.
if (IsVP8XNeeded(enc)) {
err = PutVP8XHeader(enc);
if (err != VP8_ENC_OK) goto Error;
}
// Alpha.
if (enc->has_alpha_) {
err = PutAlphaChunk(enc);
if (err != VP8_ENC_OK) goto Error;
}
// VP8 header.
err = PutVP8Header(pic, vp8_size);
if (err != VP8_ENC_OK) goto Error;
// VP8 frame header.
err = PutVP8FrameHeader(pic, enc->profile_, size0);
if (err != VP8_ENC_OK) goto Error;
// All OK.
return 1;
// Error.
Error:
return WebPEncodingSetError(pic, err);
}
// Segmentation header
static void PutSegmentHeader(VP8BitWriter* const bw,
const VP8Encoder* const enc) {
const VP8EncSegmentHeader* const hdr = &enc->segment_hdr_;
const VP8EncProba* const proba = &enc->proba_;
if (VP8PutBitUniform(bw, (hdr->num_segments_ > 1))) {
// We always 'update' the quant and filter strength values
const int update_data = 1;
int s;
VP8PutBitUniform(bw, hdr->update_map_);
if (VP8PutBitUniform(bw, update_data)) {
// we always use absolute values, not relative ones
VP8PutBitUniform(bw, 1); // (segment_feature_mode = 1. Paragraph 9.3.)
for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
VP8PutSignedBits(bw, enc->dqm_[s].quant_, 7);
}
for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
VP8PutSignedBits(bw, enc->dqm_[s].fstrength_, 6);
}
}
if (hdr->update_map_) {
for (s = 0; s < 3; ++s) {
if (VP8PutBitUniform(bw, (proba->segments_[s] != 255u))) {
VP8PutBits(bw, proba->segments_[s], 8);
}
}
}
}
}
// Filtering parameters header
static void PutFilterHeader(VP8BitWriter* const bw,
const VP8EncFilterHeader* const hdr) {
const int use_lf_delta = (hdr->i4x4_lf_delta_ != 0);
VP8PutBitUniform(bw, hdr->simple_);
VP8PutBits(bw, hdr->level_, 6);
VP8PutBits(bw, hdr->sharpness_, 3);
if (VP8PutBitUniform(bw, use_lf_delta)) {
// '0' is the default value for i4x4_lf_delta_ at frame #0.
const int need_update = (hdr->i4x4_lf_delta_ != 0);
if (VP8PutBitUniform(bw, need_update)) {
// we don't use ref_lf_delta => emit four 0 bits
VP8PutBits(bw, 0, 4);
// we use mode_lf_delta for i4x4
VP8PutSignedBits(bw, hdr->i4x4_lf_delta_, 6);
VP8PutBits(bw, 0, 3); // all others unused
}
}
}
// Nominal quantization parameters
static void PutQuant(VP8BitWriter* const bw,
const VP8Encoder* const enc) {
VP8PutBits(bw, enc->base_quant_, 7);
VP8PutSignedBits(bw, enc->dq_y1_dc_, 4);
VP8PutSignedBits(bw, enc->dq_y2_dc_, 4);
VP8PutSignedBits(bw, enc->dq_y2_ac_, 4);
VP8PutSignedBits(bw, enc->dq_uv_dc_, 4);
VP8PutSignedBits(bw, enc->dq_uv_ac_, 4);
}
// Partition sizes
static int EmitPartitionsSize(const VP8Encoder* const enc,
WebPPicture* const pic) {
uint8_t buf[3 * (MAX_NUM_PARTITIONS - 1)];
int p;
for (p = 0; p < enc->num_parts_ - 1; ++p) {
const size_t part_size = VP8BitWriterSize(enc->parts_ + p);
if (part_size >= VP8_MAX_PARTITION_SIZE) {
return WebPEncodingSetError(pic, VP8_ENC_ERROR_PARTITION_OVERFLOW);
}
buf[3 * p + 0] = (part_size >> 0) & 0xff;
buf[3 * p + 1] = (part_size >> 8) & 0xff;
buf[3 * p + 2] = (part_size >> 16) & 0xff;
}
return p ? pic->writer(buf, 3 * p, pic) : 1;
}
//------------------------------------------------------------------------------
static int GeneratePartition0(VP8Encoder* const enc) {
VP8BitWriter* const bw = &enc->bw_;
const int mb_size = enc->mb_w_ * enc->mb_h_;
uint64_t pos1, pos2, pos3;
pos1 = VP8BitWriterPos(bw);
if (!VP8BitWriterInit(bw, mb_size * 7 / 8)) { // ~7 bits per macroblock
return WebPEncodingSetError(enc->pic_, VP8_ENC_ERROR_OUT_OF_MEMORY);
}
VP8PutBitUniform(bw, 0); // colorspace
VP8PutBitUniform(bw, 0); // clamp type
PutSegmentHeader(bw, enc);
PutFilterHeader(bw, &enc->filter_hdr_);
VP8PutBits(bw, enc->num_parts_ == 8 ? 3 :
enc->num_parts_ == 4 ? 2 :
enc->num_parts_ == 2 ? 1 : 0, 2);
PutQuant(bw, enc);
VP8PutBitUniform(bw, 0); // no proba update
VP8WriteProbas(bw, &enc->proba_);
pos2 = VP8BitWriterPos(bw);
VP8CodeIntraModes(enc);
VP8BitWriterFinish(bw);
pos3 = VP8BitWriterPos(bw);
#if !defined(WEBP_DISABLE_STATS)
if (enc->pic_->stats) {
enc->pic_->stats->header_bytes[0] = (int)((pos2 - pos1 + 7) >> 3);
enc->pic_->stats->header_bytes[1] = (int)((pos3 - pos2 + 7) >> 3);
enc->pic_->stats->alpha_data_size = (int)enc->alpha_data_size_;
}
#else
(void)pos1;
(void)pos2;
(void)pos3;
#endif
if (bw->error_) {
return WebPEncodingSetError(enc->pic_, VP8_ENC_ERROR_OUT_OF_MEMORY);
}
return 1;
}
void VP8EncFreeBitWriters(VP8Encoder* const enc) {
int p;
VP8BitWriterWipeOut(&enc->bw_);
for (p = 0; p < enc->num_parts_; ++p) {
VP8BitWriterWipeOut(enc->parts_ + p);
}
}
int VP8EncWrite(VP8Encoder* const enc) {
WebPPicture* const pic = enc->pic_;
VP8BitWriter* const bw = &enc->bw_;
const int task_percent = 19;
const int percent_per_part = task_percent / enc->num_parts_;
const int final_percent = enc->percent_ + task_percent;
int ok = 0;
size_t vp8_size, pad, riff_size;
int p;
// Partition #0 with header and partition sizes
ok = GeneratePartition0(enc);
if (!ok) return 0;
// Compute VP8 size
vp8_size = VP8_FRAME_HEADER_SIZE +
VP8BitWriterSize(bw) +
3 * (enc->num_parts_ - 1);
for (p = 0; p < enc->num_parts_; ++p) {
vp8_size += VP8BitWriterSize(enc->parts_ + p);
}
pad = vp8_size & 1;
vp8_size += pad;
// Compute RIFF size
// At the minimum it is: "WEBPVP8 nnnn" + VP8 data size.
riff_size = TAG_SIZE + CHUNK_HEADER_SIZE + vp8_size;
if (IsVP8XNeeded(enc)) { // Add size for: VP8X header + data.
riff_size += CHUNK_HEADER_SIZE + VP8X_CHUNK_SIZE;
}
if (enc->has_alpha_) { // Add size for: ALPH header + data.
const uint32_t padded_alpha_size = enc->alpha_data_size_ +
(enc->alpha_data_size_ & 1);
riff_size += CHUNK_HEADER_SIZE + padded_alpha_size;
}
// RIFF size should fit in 32-bits.
if (riff_size > 0xfffffffeU) {
return WebPEncodingSetError(pic, VP8_ENC_ERROR_FILE_TOO_BIG);
}
// Emit headers and partition #0
{
const uint8_t* const part0 = VP8BitWriterBuf(bw);
const size_t size0 = VP8BitWriterSize(bw);
ok = ok && PutWebPHeaders(enc, size0, vp8_size, riff_size)
&& pic->writer(part0, size0, pic)
&& EmitPartitionsSize(enc, pic);
VP8BitWriterWipeOut(bw); // will free the internal buffer.
}
// Token partitions
for (p = 0; p < enc->num_parts_; ++p) {
const uint8_t* const buf = VP8BitWriterBuf(enc->parts_ + p);
const size_t size = VP8BitWriterSize(enc->parts_ + p);
if (size) ok = ok && pic->writer(buf, size, pic);
VP8BitWriterWipeOut(enc->parts_ + p); // will free the internal buffer.
ok = ok && WebPReportProgress(pic, enc->percent_ + percent_per_part,
&enc->percent_);
}
// Padding byte
if (ok && pad) {
ok = PutPaddingByte(pic);
}
enc->coded_size_ = (int)(CHUNK_HEADER_SIZE + riff_size);
ok = ok && WebPReportProgress(pic, final_percent, &enc->percent_);
return ok;
}
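// Resulting byte stream, in order: the RIFF header ("RIFF" + size + "WEBP"),
// an optional VP8X chunk, an optional ALPH chunk, the "VP8 " chunk header,
// the VP8 frame header, partition #0 followed by the partition-size table,
// the token partitions, and an optional padding byte keeping the file even.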
//------------------------------------------------------------------------------


@ -0,0 +1,262 @@
// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Paginated token buffer
//
// A 'token' is a bit value associated with a probability, either fixed
// or to be determined later, once statistics have been collected.
// For dynamic probability, we just record the slot id (idx) for the probability
// value in the final probability array (uint8_t* probas in VP8EmitTokens).
//
// Author: Skal (pascal.massimino@gmail.com)
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include "src/enc/cost_enc.h"
#include "src/enc/vp8i_enc.h"
#include "src/utils/utils.h"
#if !defined(DISABLE_TOKEN_BUFFER)
// we use pages to reduce the number of memcpy()
#define MIN_PAGE_SIZE 8192 // minimum number of tokens per page
#define FIXED_PROBA_BIT (1u << 14)
typedef uint16_t token_t; // bit #15: bit value
// bit #14: flags for constant proba or idx
// bits #0..13: slot or constant proba
struct VP8Tokens {
VP8Tokens* next_; // pointer to next page
};
// Token data is located in memory just after the next_ field.
// This macro is used to return their address and hide the trick.
#define TOKEN_DATA(p) ((const token_t*)&(p)[1])
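// Illustration (not upstream code) of how a token round-trips. A
// dynamic-proba token for bit=1 with slot 341 is stored as
//   (1u << 15) | 341 == 0x8155
// and decoded in VP8EmitTokens() as bit = (t >> 15) & 1, slot = t & 0x3fff.
// A constant-proba token for bit=0 with proba 159 is stored as
//   (0u << 15) | FIXED_PROBA_BIT | 159 == 0x409f
// and emitted directly with VP8PutBit(bw, bit, t & 0xff).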
//------------------------------------------------------------------------------
void VP8TBufferInit(VP8TBuffer* const b, int page_size) {
b->tokens_ = NULL;
b->pages_ = NULL;
b->last_page_ = &b->pages_;
b->left_ = 0;
b->page_size_ = (page_size < MIN_PAGE_SIZE) ? MIN_PAGE_SIZE : page_size;
b->error_ = 0;
}
void VP8TBufferClear(VP8TBuffer* const b) {
if (b != NULL) {
VP8Tokens* p = b->pages_;
while (p != NULL) {
VP8Tokens* const next = p->next_;
WebPSafeFree(p);
p = next;
}
VP8TBufferInit(b, b->page_size_);
}
}
static int TBufferNewPage(VP8TBuffer* const b) {
VP8Tokens* page = NULL;
if (!b->error_) {
const size_t size = sizeof(*page) + b->page_size_ * sizeof(token_t);
page = (VP8Tokens*)WebPSafeMalloc(1ULL, size);
}
if (page == NULL) {
b->error_ = 1;
return 0;
}
page->next_ = NULL;
*b->last_page_ = page;
b->last_page_ = &page->next_;
b->left_ = b->page_size_;
b->tokens_ = (token_t*)TOKEN_DATA(page);
return 1;
}
//------------------------------------------------------------------------------
#define TOKEN_ID(t, b, ctx) \
(NUM_PROBAS * ((ctx) + NUM_CTX * ((b) + NUM_BANDS * (t))))
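// Worked example (illustration, assuming NUM_PROBAS == 11, NUM_CTX == 3 and
// NUM_BANDS == 8 from vp8i_enc.h): TOKEN_ID(1, 2, 1) ==
// 11 * (1 + 3 * (2 + 8 * 1)) == 341, i.e. the probability array is the
// [type][band][ctx][proba] tensor flattened in row-major order, with the
// per-branch index added later as 'base_id + p'.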
static WEBP_INLINE uint32_t AddToken(VP8TBuffer* const b, uint32_t bit,
uint32_t proba_idx,
proba_t* const stats) {
assert(proba_idx < FIXED_PROBA_BIT);
assert(bit <= 1);
if (b->left_ > 0 || TBufferNewPage(b)) {
const int slot = --b->left_;
b->tokens_[slot] = (bit << 15) | proba_idx;
}
VP8RecordStats(bit, stats);
return bit;
}
static WEBP_INLINE void AddConstantToken(VP8TBuffer* const b,
uint32_t bit, uint32_t proba) {
assert(proba < 256);
assert(bit <= 1);
if (b->left_ > 0 || TBufferNewPage(b)) {
const int slot = --b->left_;
b->tokens_[slot] = (bit << 15) | FIXED_PROBA_BIT | proba;
}
}
int VP8RecordCoeffTokens(int ctx, const struct VP8Residual* const res,
VP8TBuffer* const tokens) {
const int16_t* const coeffs = res->coeffs;
const int coeff_type = res->coeff_type;
const int last = res->last;
int n = res->first;
uint32_t base_id = TOKEN_ID(coeff_type, n, ctx);
// should be stats[VP8EncBands[n]], but it's equivalent for n=0 or 1
proba_t* s = res->stats[n][ctx];
if (!AddToken(tokens, last >= 0, base_id + 0, s + 0)) {
return 0;
}
while (n < 16) {
const int c = coeffs[n++];
const int sign = c < 0;
const uint32_t v = sign ? -c : c;
if (!AddToken(tokens, v != 0, base_id + 1, s + 1)) {
base_id = TOKEN_ID(coeff_type, VP8EncBands[n], 0); // ctx=0
s = res->stats[VP8EncBands[n]][0];
continue;
}
if (!AddToken(tokens, v > 1, base_id + 2, s + 2)) {
base_id = TOKEN_ID(coeff_type, VP8EncBands[n], 1); // ctx=1
s = res->stats[VP8EncBands[n]][1];
} else {
if (!AddToken(tokens, v > 4, base_id + 3, s + 3)) {
if (AddToken(tokens, v != 2, base_id + 4, s + 4)) {
AddToken(tokens, v == 4, base_id + 5, s + 5);
}
} else if (!AddToken(tokens, v > 10, base_id + 6, s + 6)) {
if (!AddToken(tokens, v > 6, base_id + 7, s + 7)) {
AddConstantToken(tokens, v == 6, 159);
} else {
AddConstantToken(tokens, v >= 9, 165);
AddConstantToken(tokens, !(v & 1), 145);
}
} else {
int mask;
const uint8_t* tab;
uint32_t residue = v - 3;
if (residue < (8 << 1)) { // VP8Cat3 (3b)
AddToken(tokens, 0, base_id + 8, s + 8);
AddToken(tokens, 0, base_id + 9, s + 9);
residue -= (8 << 0);
mask = 1 << 2;
tab = VP8Cat3;
} else if (residue < (8 << 2)) { // VP8Cat4 (4b)
AddToken(tokens, 0, base_id + 8, s + 8);
AddToken(tokens, 1, base_id + 9, s + 9);
residue -= (8 << 1);
mask = 1 << 3;
tab = VP8Cat4;
} else if (residue < (8 << 3)) { // VP8Cat5 (5b)
AddToken(tokens, 1, base_id + 8, s + 8);
AddToken(tokens, 0, base_id + 10, s + 9);
residue -= (8 << 2);
mask = 1 << 4;
tab = VP8Cat5;
} else { // VP8Cat6 (11b)
AddToken(tokens, 1, base_id + 8, s + 8);
AddToken(tokens, 1, base_id + 10, s + 9);
residue -= (8 << 3);
mask = 1 << 10;
tab = VP8Cat6;
}
while (mask) {
AddConstantToken(tokens, !!(residue & mask), *tab++);
mask >>= 1;
}
}
base_id = TOKEN_ID(coeff_type, VP8EncBands[n], 2); // ctx=2
s = res->stats[VP8EncBands[n]][2];
}
AddConstantToken(tokens, sign, 128);
if (n == 16 || !AddToken(tokens, n <= last, base_id + 0, s + 0)) {
return 1; // EOB
}
}
return 1;
}
#undef TOKEN_ID
//------------------------------------------------------------------------------
// Final coding pass, with known probabilities
int VP8EmitTokens(VP8TBuffer* const b, VP8BitWriter* const bw,
const uint8_t* const probas, int final_pass) {
const VP8Tokens* p = b->pages_;
assert(!b->error_);
while (p != NULL) {
const VP8Tokens* const next = p->next_;
const int N = (next == NULL) ? b->left_ : 0;
int n = b->page_size_;
const token_t* const tokens = TOKEN_DATA(p);
while (n-- > N) {
const token_t token = tokens[n];
const int bit = (token >> 15) & 1;
if (token & FIXED_PROBA_BIT) {
VP8PutBit(bw, bit, token & 0xffu); // constant proba
} else {
VP8PutBit(bw, bit, probas[token & 0x3fffu]);
}
}
if (final_pass) WebPSafeFree((void*)p);
p = next;
}
if (final_pass) b->pages_ = NULL;
return 1;
}
// Size estimation
size_t VP8EstimateTokenSize(VP8TBuffer* const b, const uint8_t* const probas) {
size_t size = 0;
const VP8Tokens* p = b->pages_;
assert(!b->error_);
while (p != NULL) {
const VP8Tokens* const next = p->next_;
const int N = (next == NULL) ? b->left_ : 0;
int n = b->page_size_;
const token_t* const tokens = TOKEN_DATA(p);
while (n-- > N) {
const token_t token = tokens[n];
const int bit = token & (1 << 15);
if (token & FIXED_PROBA_BIT) {
size += VP8BitCost(bit, token & 0xffu);
} else {
size += VP8BitCost(bit, probas[token & 0x3fffu]);
}
}
p = next;
}
return size;
}
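// A minimal sketch of the intended two-pass flow (illustration only; the
// real driver is VP8EncTokenLoop(), and 'probas'/'res' are placeholders):
#if 0  // example only
static void ExampleTwoPassFlow(VP8BitWriter* const bw,
                               const uint8_t* const probas,
                               int ctx, const struct VP8Residual* const res) {
  VP8TBuffer tokens;
  VP8TBufferInit(&tokens, 0);   // 0 is clamped up to MIN_PAGE_SIZE
  // Pass 1: record tokens and per-branch statistics for each coded block.
  VP8RecordCoeffTokens(ctx, res, &tokens);
  // ... derive the final probability array from the collected stats ...
  // Pass 2: emit everything now that the probabilities are known.
  VP8EmitTokens(&tokens, bw, probas, 1 /* final_pass */);
  VP8TBufferClear(&tokens);
}
#endif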
//------------------------------------------------------------------------------
#else // DISABLE_TOKEN_BUFFER
void VP8TBufferInit(VP8TBuffer* const b, int page_size) {
(void)b;
(void)page_size;
}
void VP8TBufferClear(VP8TBuffer* const b) {
(void)b;
}
#endif // !DISABLE_TOKEN_BUFFER


@ -0,0 +1,504 @@
// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Coding of token probabilities, intra modes and segments.
//
// Author: Skal (pascal.massimino@gmail.com)
#include "src/enc/vp8i_enc.h"
//------------------------------------------------------------------------------
// Default probabilities
// Paragraph 13.5
const uint8_t
VP8CoeffsProba0[NUM_TYPES][NUM_BANDS][NUM_CTX][NUM_PROBAS] = {
{ { { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
{ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
{ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
},
{ { 253, 136, 254, 255, 228, 219, 128, 128, 128, 128, 128 },
{ 189, 129, 242, 255, 227, 213, 255, 219, 128, 128, 128 },
{ 106, 126, 227, 252, 214, 209, 255, 255, 128, 128, 128 }
},
{ { 1, 98, 248, 255, 236, 226, 255, 255, 128, 128, 128 },
{ 181, 133, 238, 254, 221, 234, 255, 154, 128, 128, 128 },
{ 78, 134, 202, 247, 198, 180, 255, 219, 128, 128, 128 },
},
{ { 1, 185, 249, 255, 243, 255, 128, 128, 128, 128, 128 },
{ 184, 150, 247, 255, 236, 224, 128, 128, 128, 128, 128 },
{ 77, 110, 216, 255, 236, 230, 128, 128, 128, 128, 128 },
},
{ { 1, 101, 251, 255, 241, 255, 128, 128, 128, 128, 128 },
{ 170, 139, 241, 252, 236, 209, 255, 255, 128, 128, 128 },
{ 37, 116, 196, 243, 228, 255, 255, 255, 128, 128, 128 }
},
{ { 1, 204, 254, 255, 245, 255, 128, 128, 128, 128, 128 },
{ 207, 160, 250, 255, 238, 128, 128, 128, 128, 128, 128 },
{ 102, 103, 231, 255, 211, 171, 128, 128, 128, 128, 128 }
},
{ { 1, 152, 252, 255, 240, 255, 128, 128, 128, 128, 128 },
{ 177, 135, 243, 255, 234, 225, 128, 128, 128, 128, 128 },
{ 80, 129, 211, 255, 194, 224, 128, 128, 128, 128, 128 }
},
{ { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
{ 246, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
{ 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
}
},
{ { { 198, 35, 237, 223, 193, 187, 162, 160, 145, 155, 62 },
{ 131, 45, 198, 221, 172, 176, 220, 157, 252, 221, 1 },
{ 68, 47, 146, 208, 149, 167, 221, 162, 255, 223, 128 }
},
{ { 1, 149, 241, 255, 221, 224, 255, 255, 128, 128, 128 },
{ 184, 141, 234, 253, 222, 220, 255, 199, 128, 128, 128 },
{ 81, 99, 181, 242, 176, 190, 249, 202, 255, 255, 128 }
},
{ { 1, 129, 232, 253, 214, 197, 242, 196, 255, 255, 128 },
{ 99, 121, 210, 250, 201, 198, 255, 202, 128, 128, 128 },
{ 23, 91, 163, 242, 170, 187, 247, 210, 255, 255, 128 }
},
{ { 1, 200, 246, 255, 234, 255, 128, 128, 128, 128, 128 },
{ 109, 178, 241, 255, 231, 245, 255, 255, 128, 128, 128 },
{ 44, 130, 201, 253, 205, 192, 255, 255, 128, 128, 128 }
},
{ { 1, 132, 239, 251, 219, 209, 255, 165, 128, 128, 128 },
{ 94, 136, 225, 251, 218, 190, 255, 255, 128, 128, 128 },
{ 22, 100, 174, 245, 186, 161, 255, 199, 128, 128, 128 }
},
{ { 1, 182, 249, 255, 232, 235, 128, 128, 128, 128, 128 },
{ 124, 143, 241, 255, 227, 234, 128, 128, 128, 128, 128 },
{ 35, 77, 181, 251, 193, 211, 255, 205, 128, 128, 128 }
},
{ { 1, 157, 247, 255, 236, 231, 255, 255, 128, 128, 128 },
{ 121, 141, 235, 255, 225, 227, 255, 255, 128, 128, 128 },
{ 45, 99, 188, 251, 195, 217, 255, 224, 128, 128, 128 }
},
{ { 1, 1, 251, 255, 213, 255, 128, 128, 128, 128, 128 },
{ 203, 1, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
{ 137, 1, 177, 255, 224, 255, 128, 128, 128, 128, 128 }
}
},
{ { { 253, 9, 248, 251, 207, 208, 255, 192, 128, 128, 128 },
{ 175, 13, 224, 243, 193, 185, 249, 198, 255, 255, 128 },
{ 73, 17, 171, 221, 161, 179, 236, 167, 255, 234, 128 }
},
{ { 1, 95, 247, 253, 212, 183, 255, 255, 128, 128, 128 },
{ 239, 90, 244, 250, 211, 209, 255, 255, 128, 128, 128 },
{ 155, 77, 195, 248, 188, 195, 255, 255, 128, 128, 128 }
},
{ { 1, 24, 239, 251, 218, 219, 255, 205, 128, 128, 128 },
{ 201, 51, 219, 255, 196, 186, 128, 128, 128, 128, 128 },
{ 69, 46, 190, 239, 201, 218, 255, 228, 128, 128, 128 }
},
{ { 1, 191, 251, 255, 255, 128, 128, 128, 128, 128, 128 },
{ 223, 165, 249, 255, 213, 255, 128, 128, 128, 128, 128 },
{ 141, 124, 248, 255, 255, 128, 128, 128, 128, 128, 128 }
},
{ { 1, 16, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
{ 190, 36, 230, 255, 236, 255, 128, 128, 128, 128, 128 },
{ 149, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
},
{ { 1, 226, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
{ 247, 192, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
{ 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
},
{ { 1, 134, 252, 255, 255, 128, 128, 128, 128, 128, 128 },
{ 213, 62, 250, 255, 255, 128, 128, 128, 128, 128, 128 },
{ 55, 93, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
},
{ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
{ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
{ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
}
},
{ { { 202, 24, 213, 235, 186, 191, 220, 160, 240, 175, 255 },
{ 126, 38, 182, 232, 169, 184, 228, 174, 255, 187, 128 },
{ 61, 46, 138, 219, 151, 178, 240, 170, 255, 216, 128 }
},
{ { 1, 112, 230, 250, 199, 191, 247, 159, 255, 255, 128 },
{ 166, 109, 228, 252, 211, 215, 255, 174, 128, 128, 128 },
{ 39, 77, 162, 232, 172, 180, 245, 178, 255, 255, 128 }
},
{ { 1, 52, 220, 246, 198, 199, 249, 220, 255, 255, 128 },
{ 124, 74, 191, 243, 183, 193, 250, 221, 255, 255, 128 },
{ 24, 71, 130, 219, 154, 170, 243, 182, 255, 255, 128 }
},
{ { 1, 182, 225, 249, 219, 240, 255, 224, 128, 128, 128 },
{ 149, 150, 226, 252, 216, 205, 255, 171, 128, 128, 128 },
{ 28, 108, 170, 242, 183, 194, 254, 223, 255, 255, 128 }
},
{ { 1, 81, 230, 252, 204, 203, 255, 192, 128, 128, 128 },
{ 123, 102, 209, 247, 188, 196, 255, 233, 128, 128, 128 },
{ 20, 95, 153, 243, 164, 173, 255, 203, 128, 128, 128 }
},
{ { 1, 222, 248, 255, 216, 213, 128, 128, 128, 128, 128 },
{ 168, 175, 246, 252, 235, 205, 255, 255, 128, 128, 128 },
{ 47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 }
},
{ { 1, 121, 236, 253, 212, 214, 255, 255, 128, 128, 128 },
{ 141, 84, 213, 252, 201, 202, 255, 219, 128, 128, 128 },
{ 42, 80, 160, 240, 162, 185, 255, 205, 128, 128, 128 }
},
{ { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
{ 244, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
{ 238, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
}
}
};
void VP8DefaultProbas(VP8Encoder* const enc) {
VP8EncProba* const probas = &enc->proba_;
probas->use_skip_proba_ = 0;
memset(probas->segments_, 255u, sizeof(probas->segments_));
memcpy(probas->coeffs_, VP8CoeffsProba0, sizeof(VP8CoeffsProba0));
// Note: we could hard-code the level_costs_ corresponding to VP8CoeffsProba0,
// but that's ~11k of static data. Better call VP8CalculateLevelCosts() later.
probas->dirty_ = 1;
}
// Paragraph 11.5. 900 bytes.
static const uint8_t kBModesProba[NUM_BMODES][NUM_BMODES][NUM_BMODES - 1] = {
{ { 231, 120, 48, 89, 115, 113, 120, 152, 112 },
{ 152, 179, 64, 126, 170, 118, 46, 70, 95 },
{ 175, 69, 143, 80, 85, 82, 72, 155, 103 },
{ 56, 58, 10, 171, 218, 189, 17, 13, 152 },
{ 114, 26, 17, 163, 44, 195, 21, 10, 173 },
{ 121, 24, 80, 195, 26, 62, 44, 64, 85 },
{ 144, 71, 10, 38, 171, 213, 144, 34, 26 },
{ 170, 46, 55, 19, 136, 160, 33, 206, 71 },
{ 63, 20, 8, 114, 114, 208, 12, 9, 226 },
{ 81, 40, 11, 96, 182, 84, 29, 16, 36 } },
{ { 134, 183, 89, 137, 98, 101, 106, 165, 148 },
{ 72, 187, 100, 130, 157, 111, 32, 75, 80 },
{ 66, 102, 167, 99, 74, 62, 40, 234, 128 },
{ 41, 53, 9, 178, 241, 141, 26, 8, 107 },
{ 74, 43, 26, 146, 73, 166, 49, 23, 157 },
{ 65, 38, 105, 160, 51, 52, 31, 115, 128 },
{ 104, 79, 12, 27, 217, 255, 87, 17, 7 },
{ 87, 68, 71, 44, 114, 51, 15, 186, 23 },
{ 47, 41, 14, 110, 182, 183, 21, 17, 194 },
{ 66, 45, 25, 102, 197, 189, 23, 18, 22 } },
{ { 88, 88, 147, 150, 42, 46, 45, 196, 205 },
{ 43, 97, 183, 117, 85, 38, 35, 179, 61 },
{ 39, 53, 200, 87, 26, 21, 43, 232, 171 },
{ 56, 34, 51, 104, 114, 102, 29, 93, 77 },
{ 39, 28, 85, 171, 58, 165, 90, 98, 64 },
{ 34, 22, 116, 206, 23, 34, 43, 166, 73 },
{ 107, 54, 32, 26, 51, 1, 81, 43, 31 },
{ 68, 25, 106, 22, 64, 171, 36, 225, 114 },
{ 34, 19, 21, 102, 132, 188, 16, 76, 124 },
{ 62, 18, 78, 95, 85, 57, 50, 48, 51 } },
{ { 193, 101, 35, 159, 215, 111, 89, 46, 111 },
{ 60, 148, 31, 172, 219, 228, 21, 18, 111 },
{ 112, 113, 77, 85, 179, 255, 38, 120, 114 },
{ 40, 42, 1, 196, 245, 209, 10, 25, 109 },
{ 88, 43, 29, 140, 166, 213, 37, 43, 154 },
{ 61, 63, 30, 155, 67, 45, 68, 1, 209 },
{ 100, 80, 8, 43, 154, 1, 51, 26, 71 },
{ 142, 78, 78, 16, 255, 128, 34, 197, 171 },
{ 41, 40, 5, 102, 211, 183, 4, 1, 221 },
{ 51, 50, 17, 168, 209, 192, 23, 25, 82 } },
{ { 138, 31, 36, 171, 27, 166, 38, 44, 229 },
{ 67, 87, 58, 169, 82, 115, 26, 59, 179 },
{ 63, 59, 90, 180, 59, 166, 93, 73, 154 },
{ 40, 40, 21, 116, 143, 209, 34, 39, 175 },
{ 47, 15, 16, 183, 34, 223, 49, 45, 183 },
{ 46, 17, 33, 183, 6, 98, 15, 32, 183 },
{ 57, 46, 22, 24, 128, 1, 54, 17, 37 },
{ 65, 32, 73, 115, 28, 128, 23, 128, 205 },
{ 40, 3, 9, 115, 51, 192, 18, 6, 223 },
{ 87, 37, 9, 115, 59, 77, 64, 21, 47 } },
{ { 104, 55, 44, 218, 9, 54, 53, 130, 226 },
{ 64, 90, 70, 205, 40, 41, 23, 26, 57 },
{ 54, 57, 112, 184, 5, 41, 38, 166, 213 },
{ 30, 34, 26, 133, 152, 116, 10, 32, 134 },
{ 39, 19, 53, 221, 26, 114, 32, 73, 255 },
{ 31, 9, 65, 234, 2, 15, 1, 118, 73 },
{ 75, 32, 12, 51, 192, 255, 160, 43, 51 },
{ 88, 31, 35, 67, 102, 85, 55, 186, 85 },
{ 56, 21, 23, 111, 59, 205, 45, 37, 192 },
{ 55, 38, 70, 124, 73, 102, 1, 34, 98 } },
{ { 125, 98, 42, 88, 104, 85, 117, 175, 82 },
{ 95, 84, 53, 89, 128, 100, 113, 101, 45 },
{ 75, 79, 123, 47, 51, 128, 81, 171, 1 },
{ 57, 17, 5, 71, 102, 57, 53, 41, 49 },
{ 38, 33, 13, 121, 57, 73, 26, 1, 85 },
{ 41, 10, 67, 138, 77, 110, 90, 47, 114 },
{ 115, 21, 2, 10, 102, 255, 166, 23, 6 },
{ 101, 29, 16, 10, 85, 128, 101, 196, 26 },
{ 57, 18, 10, 102, 102, 213, 34, 20, 43 },
{ 117, 20, 15, 36, 163, 128, 68, 1, 26 } },
{ { 102, 61, 71, 37, 34, 53, 31, 243, 192 },
{ 69, 60, 71, 38, 73, 119, 28, 222, 37 },
{ 68, 45, 128, 34, 1, 47, 11, 245, 171 },
{ 62, 17, 19, 70, 146, 85, 55, 62, 70 },
{ 37, 43, 37, 154, 100, 163, 85, 160, 1 },
{ 63, 9, 92, 136, 28, 64, 32, 201, 85 },
{ 75, 15, 9, 9, 64, 255, 184, 119, 16 },
{ 86, 6, 28, 5, 64, 255, 25, 248, 1 },
{ 56, 8, 17, 132, 137, 255, 55, 116, 128 },
{ 58, 15, 20, 82, 135, 57, 26, 121, 40 } },
{ { 164, 50, 31, 137, 154, 133, 25, 35, 218 },
{ 51, 103, 44, 131, 131, 123, 31, 6, 158 },
{ 86, 40, 64, 135, 148, 224, 45, 183, 128 },
{ 22, 26, 17, 131, 240, 154, 14, 1, 209 },
{ 45, 16, 21, 91, 64, 222, 7, 1, 197 },
{ 56, 21, 39, 155, 60, 138, 23, 102, 213 },
{ 83, 12, 13, 54, 192, 255, 68, 47, 28 },
{ 85, 26, 85, 85, 128, 128, 32, 146, 171 },
{ 18, 11, 7, 63, 144, 171, 4, 4, 246 },
{ 35, 27, 10, 146, 174, 171, 12, 26, 128 } },
{ { 190, 80, 35, 99, 180, 80, 126, 54, 45 },
{ 85, 126, 47, 87, 176, 51, 41, 20, 32 },
{ 101, 75, 128, 139, 118, 146, 116, 128, 85 },
{ 56, 41, 15, 176, 236, 85, 37, 9, 62 },
{ 71, 30, 17, 119, 118, 255, 17, 18, 138 },
{ 101, 38, 60, 138, 55, 70, 43, 26, 142 },
{ 146, 36, 19, 30, 171, 255, 97, 27, 20 },
{ 138, 45, 61, 62, 219, 1, 81, 188, 64 },
{ 32, 41, 20, 117, 151, 142, 20, 21, 163 },
{ 112, 19, 12, 61, 195, 128, 48, 4, 24 } }
};
static int PutI4Mode(VP8BitWriter* const bw, int mode,
const uint8_t* const prob) {
if (VP8PutBit(bw, mode != B_DC_PRED, prob[0])) {
if (VP8PutBit(bw, mode != B_TM_PRED, prob[1])) {
if (VP8PutBit(bw, mode != B_VE_PRED, prob[2])) {
if (!VP8PutBit(bw, mode >= B_LD_PRED, prob[3])) {
if (VP8PutBit(bw, mode != B_HE_PRED, prob[4])) {
VP8PutBit(bw, mode != B_RD_PRED, prob[5]);
}
} else {
if (VP8PutBit(bw, mode != B_LD_PRED, prob[6])) {
if (VP8PutBit(bw, mode != B_VL_PRED, prob[7])) {
VP8PutBit(bw, mode != B_HD_PRED, prob[8]);
}
}
}
}
}
}
return mode;
}
static void PutI16Mode(VP8BitWriter* const bw, int mode) {
if (VP8PutBit(bw, (mode == TM_PRED || mode == H_PRED), 156)) {
VP8PutBit(bw, mode == TM_PRED, 128); // TM or HE
} else {
VP8PutBit(bw, mode == V_PRED, 163); // VE or DC
}
}
static void PutUVMode(VP8BitWriter* const bw, int uv_mode) {
if (VP8PutBit(bw, uv_mode != DC_PRED, 142)) {
if (VP8PutBit(bw, uv_mode != V_PRED, 114)) {
VP8PutBit(bw, uv_mode != H_PRED, 183); // else: TM_PRED
}
}
}
static void PutSegment(VP8BitWriter* const bw, int s, const uint8_t* p) {
if (VP8PutBit(bw, s >= 2, p[0])) p += 1;
VP8PutBit(bw, s & 1, p[1]);
}
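// Illustration (not upstream code): the segment id is coded as two bits on
// a tiny tree. p[0] selects between {0,1} and {2,3}; the low bit is then
// coded with p[1] for s < 2, or with p[2] for s >= 2 (via the 'p += 1').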
void VP8CodeIntraModes(VP8Encoder* const enc) {
VP8BitWriter* const bw = &enc->bw_;
VP8EncIterator it;
VP8IteratorInit(enc, &it);
do {
const VP8MBInfo* const mb = it.mb_;
const uint8_t* preds = it.preds_;
if (enc->segment_hdr_.update_map_) {
PutSegment(bw, mb->segment_, enc->proba_.segments_);
}
if (enc->proba_.use_skip_proba_) {
VP8PutBit(bw, mb->skip_, enc->proba_.skip_proba_);
}
if (VP8PutBit(bw, (mb->type_ != 0), 145)) { // i16x16
PutI16Mode(bw, preds[0]);
} else {
const int preds_w = enc->preds_w_;
const uint8_t* top_pred = preds - preds_w;
int x, y;
for (y = 0; y < 4; ++y) {
int left = preds[-1];
for (x = 0; x < 4; ++x) {
const uint8_t* const probas = kBModesProba[top_pred[x]][left];
left = PutI4Mode(bw, preds[x], probas);
}
top_pred = preds;
preds += preds_w;
}
}
PutUVMode(bw, mb->uv_mode_);
} while (VP8IteratorNext(&it));
}
//------------------------------------------------------------------------------
// Paragraph 13
const uint8_t
VP8CoeffsUpdateProba[NUM_TYPES][NUM_BANDS][NUM_CTX][NUM_PROBAS] = {
{ { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
},
{ { 176, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 223, 241, 252, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 249, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255 }
},
{ { 255, 244, 252, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 234, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
},
{ { 255, 246, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 239, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 }
},
{ { 255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 251, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
},
{ { 255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 251, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 }
},
{ { 255, 254, 253, 255, 254, 255, 255, 255, 255, 255, 255 },
{ 250, 255, 254, 255, 254, 255, 255, 255, 255, 255, 255 },
{ 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
},
{ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
}
},
{ { { 217, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 225, 252, 241, 253, 255, 255, 254, 255, 255, 255, 255 },
{ 234, 250, 241, 250, 253, 255, 253, 254, 255, 255, 255 }
},
{ { 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 223, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 238, 253, 254, 254, 255, 255, 255, 255, 255, 255, 255 }
},
{ { 255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 249, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
},
{ { 255, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 247, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
},
{ { 255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
},
{ { 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
},
{ { 255, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
},
{ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
}
},
{ { { 186, 251, 250, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 234, 251, 244, 254, 255, 255, 255, 255, 255, 255, 255 },
{ 251, 251, 243, 253, 254, 255, 254, 255, 255, 255, 255 }
},
{ { 255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 236, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 251, 253, 253, 254, 254, 255, 255, 255, 255, 255, 255 }
},
{ { 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
},
{ { 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
},
{ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
},
{ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
},
{ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
},
{ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
}
},
{ { { 248, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 250, 254, 252, 254, 255, 255, 255, 255, 255, 255, 255 },
{ 248, 254, 249, 253, 255, 255, 255, 255, 255, 255, 255 }
},
{ { 255, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 246, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 252, 254, 251, 254, 254, 255, 255, 255, 255, 255, 255 }
},
{ { 255, 254, 252, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 248, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 253, 255, 254, 254, 255, 255, 255, 255, 255, 255, 255 }
},
{ { 255, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 245, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 253, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 }
},
{ { 255, 251, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
},
{ { 255, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 249, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 255, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 }
},
{ { 255, 255, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
},
{ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
}
}
};
void VP8WriteProbas(VP8BitWriter* const bw, const VP8EncProba* const probas) {
int t, b, c, p;
for (t = 0; t < NUM_TYPES; ++t) {
for (b = 0; b < NUM_BANDS; ++b) {
for (c = 0; c < NUM_CTX; ++c) {
for (p = 0; p < NUM_PROBAS; ++p) {
const uint8_t p0 = probas->coeffs_[t][b][c][p];
const int update = (p0 != VP8CoeffsProba0[t][b][c][p]);
if (VP8PutBit(bw, update, VP8CoeffsUpdateProba[t][b][c][p])) {
VP8PutBits(bw, p0, 8);
}
}
}
}
}
if (VP8PutBitUniform(bw, probas->use_skip_proba_)) {
VP8PutBits(bw, probas->skip_proba_, 8);
}
}

File diff suppressed because it is too large.


@ -0,0 +1,410 @@
// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// WebP encoder: main entry point
//
// Author: Skal (pascal.massimino@gmail.com)
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "src/enc/cost_enc.h"
#include "src/enc/vp8i_enc.h"
#include "src/enc/vp8li_enc.h"
#include "src/utils/utils.h"
// #define PRINT_MEMORY_INFO
#ifdef PRINT_MEMORY_INFO
#include <stdio.h>
#endif
//------------------------------------------------------------------------------
int WebPGetEncoderVersion(void) {
return (ENC_MAJ_VERSION << 16) | (ENC_MIN_VERSION << 8) | ENC_REV_VERSION;
}
//------------------------------------------------------------------------------
// VP8Encoder
//------------------------------------------------------------------------------
static void ResetSegmentHeader(VP8Encoder* const enc) {
VP8EncSegmentHeader* const hdr = &enc->segment_hdr_;
hdr->num_segments_ = enc->config_->segments;
hdr->update_map_ = (hdr->num_segments_ > 1);
hdr->size_ = 0;
}
static void ResetFilterHeader(VP8Encoder* const enc) {
VP8EncFilterHeader* const hdr = &enc->filter_hdr_;
hdr->simple_ = 1;
hdr->level_ = 0;
hdr->sharpness_ = 0;
hdr->i4x4_lf_delta_ = 0;
}
static void ResetBoundaryPredictions(VP8Encoder* const enc) {
// init boundary values once and for all
// Note: actually, initializing the preds_[] is only needed for intra4.
int i;
uint8_t* const top = enc->preds_ - enc->preds_w_;
uint8_t* const left = enc->preds_ - 1;
for (i = -1; i < 4 * enc->mb_w_; ++i) {
top[i] = B_DC_PRED;
}
for (i = 0; i < 4 * enc->mb_h_; ++i) {
left[i * enc->preds_w_] = B_DC_PRED;
}
enc->nz_[-1] = 0; // constant
}
// Mapping from config->method_ to coding tools used.
//-------------------+---+---+---+---+---+---+---+
// Method | 0 | 1 | 2 | 3 |(4)| 5 | 6 |
//-------------------+---+---+---+---+---+---+---+
// fast probe | x | | | x | | | |
//-------------------+---+---+---+---+---+---+---+
// dynamic proba | ~ | x | x | x | x | x | x |
//-------------------+---+---+---+---+---+---+---+
// fast mode analysis|[x]|[x]| | | x | x | x |
//-------------------+---+---+---+---+---+---+---+
// basic rd-opt | | | | x | x | x | x |
//-------------------+---+---+---+---+---+---+---+
// disto-refine i4/16| x | x | x | | | | |
//-------------------+---+---+---+---+---+---+---+
// disto-refine uv | | x | x | | | | |
//-------------------+---+---+---+---+---+---+---+
// rd-opt i4/16 | | | ~ | x | x | x | x |
//-------------------+---+---+---+---+---+---+---+
// token buffer (opt)| | | | x | x | x | x |
//-------------------+---+---+---+---+---+---+---+
// Trellis | | | | | | x |Ful|
//-------------------+---+---+---+---+---+---+---+
// full-SNS | | | | | x | x | x |
//-------------------+---+---+---+---+---+---+---+
static void MapConfigToTools(VP8Encoder* const enc) {
const WebPConfig* const config = enc->config_;
const int method = config->method;
const int limit = 100 - config->partition_limit;
enc->method_ = method;
enc->rd_opt_level_ = (method >= 6) ? RD_OPT_TRELLIS_ALL
: (method >= 5) ? RD_OPT_TRELLIS
: (method >= 3) ? RD_OPT_BASIC
: RD_OPT_NONE;
enc->max_i4_header_bits_ =
256 * 16 * 16 * // upper bound: up to 16bit per 4x4 block
(limit * limit) / (100 * 100); // ... modulated with a quadratic curve.
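// Worked example (illustration): with config->partition_limit == 50,
// limit == 50 and max_i4_header_bits_ == 65536 * (50 * 50) / (100 * 100)
// == 16384 bits; partition_limit == 0 keeps the full 65536-bit budget.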
// partition0 = 512k max.
enc->mb_header_limit_ =
(score_t)256 * 510 * 8 * 1024 / (enc->mb_w_ * enc->mb_h_);
enc->thread_level_ = config->thread_level;
enc->do_search_ = (config->target_size > 0 || config->target_PSNR > 0);
if (!config->low_memory) {
#if !defined(DISABLE_TOKEN_BUFFER)
enc->use_tokens_ = (enc->rd_opt_level_ >= RD_OPT_BASIC); // need rd stats
#endif
if (enc->use_tokens_) {
enc->num_parts_ = 1; // doesn't work with multi-partition
}
}
}
// Memory scaling with dimensions:
// memory (bytes) ~= 2.25 * w + 0.0625 * w * h
//
// Typical memory footprint (614x440 picture)
// encoder: 22111
// info: 4368
// preds: 17741
// top samples: 1263
// non-zero: 175
// lf-stats: 0
// total: 45658
// Transient object sizes:
// VP8EncIterator: 3360
// VP8ModeScore: 872
// VP8SegmentInfo: 732
// VP8EncProba: 18352
// LFStats: 2048
// Picture size (yuv): 419328
static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
WebPPicture* const picture) {
VP8Encoder* enc;
const int use_filter =
(config->filter_strength > 0) || (config->autofilter > 0);
const int mb_w = (picture->width + 15) >> 4;
const int mb_h = (picture->height + 15) >> 4;
const int preds_w = 4 * mb_w + 1;
const int preds_h = 4 * mb_h + 1;
const size_t preds_size = preds_w * preds_h * sizeof(*enc->preds_);
const int top_stride = mb_w * 16;
const size_t nz_size = (mb_w + 1) * sizeof(*enc->nz_) + WEBP_ALIGN_CST;
const size_t info_size = mb_w * mb_h * sizeof(*enc->mb_info_);
const size_t samples_size =
2 * top_stride * sizeof(*enc->y_top_) // top-luma/u/v
+ WEBP_ALIGN_CST; // align all
const size_t lf_stats_size =
config->autofilter ? sizeof(*enc->lf_stats_) + WEBP_ALIGN_CST : 0;
const size_t top_derr_size =
(config->quality <= ERROR_DIFFUSION_QUALITY || config->pass > 1) ?
mb_w * sizeof(*enc->top_derr_) : 0;
uint8_t* mem;
const uint64_t size = (uint64_t)sizeof(*enc) // main struct
+ WEBP_ALIGN_CST // cache alignment
+ info_size // modes info
+ preds_size // prediction modes
+ samples_size // top/left samples
+ top_derr_size // top diffusion error
+ nz_size // coeff context bits
+ lf_stats_size; // autofilter stats
#ifdef PRINT_MEMORY_INFO
printf("===================================\n");
printf("Memory used:\n"
" encoder: %ld\n"
" info: %ld\n"
" preds: %ld\n"
" top samples: %ld\n"
" top diffusion: %ld\n"
" non-zero: %ld\n"
" lf-stats: %ld\n"
" total: %ld\n",
sizeof(*enc) + WEBP_ALIGN_CST, info_size,
preds_size, samples_size, top_derr_size, nz_size, lf_stats_size, size);
printf("Transient object sizes:\n"
" VP8EncIterator: %ld\n"
" VP8ModeScore: %ld\n"
" VP8SegmentInfo: %ld\n"
" VP8EncProba: %ld\n"
" LFStats: %ld\n",
sizeof(VP8EncIterator), sizeof(VP8ModeScore),
sizeof(VP8SegmentInfo), sizeof(VP8EncProba),
sizeof(LFStats));
printf("Picture size (yuv): %ld\n",
mb_w * mb_h * 384 * sizeof(uint8_t));
printf("===================================\n");
#endif
mem = (uint8_t*)WebPSafeMalloc(size, sizeof(*mem));
if (mem == NULL) {
WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
return NULL;
}
enc = (VP8Encoder*)mem;
mem = (uint8_t*)WEBP_ALIGN(mem + sizeof(*enc));
memset(enc, 0, sizeof(*enc));
enc->num_parts_ = 1 << config->partitions;
enc->mb_w_ = mb_w;
enc->mb_h_ = mb_h;
enc->preds_w_ = preds_w;
enc->mb_info_ = (VP8MBInfo*)mem;
mem += info_size;
enc->preds_ = mem + 1 + enc->preds_w_;
mem += preds_size;
enc->nz_ = 1 + (uint32_t*)WEBP_ALIGN(mem);
mem += nz_size;
enc->lf_stats_ = lf_stats_size ? (LFStats*)WEBP_ALIGN(mem) : NULL;
mem += lf_stats_size;
// top samples (all 16-aligned)
mem = (uint8_t*)WEBP_ALIGN(mem);
enc->y_top_ = mem;
enc->uv_top_ = enc->y_top_ + top_stride;
mem += 2 * top_stride;
enc->top_derr_ = top_derr_size ? (DError*)mem : NULL;
mem += top_derr_size;
assert(mem <= (uint8_t*)enc + size);
enc->config_ = config;
enc->profile_ = use_filter ? ((config->filter_type == 1) ? 0 : 1) : 2;
enc->pic_ = picture;
enc->percent_ = 0;
MapConfigToTools(enc);
VP8EncDspInit();
VP8DefaultProbas(enc);
ResetSegmentHeader(enc);
ResetFilterHeader(enc);
ResetBoundaryPredictions(enc);
VP8EncDspCostInit();
VP8EncInitAlpha(enc);
// lower quality means smaller output -> we modulate the page size a little
// based on quality. This is just a crude first-order prediction.
{
const float scale = 1.f + config->quality * 5.f / 100.f; // in [1,6]
VP8TBufferInit(&enc->tokens_, (int)(mb_w * mb_h * 4 * scale));
}
return enc;
}
static int DeleteVP8Encoder(VP8Encoder* enc) {
int ok = 1;
if (enc != NULL) {
ok = VP8EncDeleteAlpha(enc);
VP8TBufferClear(&enc->tokens_);
WebPSafeFree(enc);
}
return ok;
}
//------------------------------------------------------------------------------
#if !defined(WEBP_DISABLE_STATS)
static double GetPSNR(uint64_t err, uint64_t size) {
return (err > 0 && size > 0) ? 10. * log10(255. * 255. * size / err) : 99.;
}
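// Worked example (illustration): with a total squared error equal to the
// sample count (i.e. a mean squared error of 1.0), GetPSNR() returns
// 10. * log10(255. * 255.) ~= 48.13 dB.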
static void FinalizePSNR(const VP8Encoder* const enc) {
WebPAuxStats* stats = enc->pic_->stats;
const uint64_t size = enc->sse_count_;
const uint64_t* const sse = enc->sse_;
stats->PSNR[0] = (float)GetPSNR(sse[0], size);
stats->PSNR[1] = (float)GetPSNR(sse[1], size / 4);
stats->PSNR[2] = (float)GetPSNR(sse[2], size / 4);
stats->PSNR[3] = (float)GetPSNR(sse[0] + sse[1] + sse[2], size * 3 / 2);
stats->PSNR[4] = (float)GetPSNR(sse[3], size);
}
#endif // !defined(WEBP_DISABLE_STATS)
static void StoreStats(VP8Encoder* const enc) {
#if !defined(WEBP_DISABLE_STATS)
WebPAuxStats* const stats = enc->pic_->stats;
if (stats != NULL) {
int i, s;
for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
stats->segment_level[i] = enc->dqm_[i].fstrength_;
stats->segment_quant[i] = enc->dqm_[i].quant_;
for (s = 0; s <= 2; ++s) {
stats->residual_bytes[s][i] = enc->residual_bytes_[s][i];
}
}
FinalizePSNR(enc);
stats->coded_size = enc->coded_size_;
for (i = 0; i < 3; ++i) {
stats->block_count[i] = enc->block_count_[i];
}
}
#else // defined(WEBP_DISABLE_STATS)
WebPReportProgress(enc->pic_, 100, &enc->percent_); // done!
#endif // !defined(WEBP_DISABLE_STATS)
}
int WebPEncodingSetError(const WebPPicture* const pic,
WebPEncodingError error) {
assert((int)error < VP8_ENC_ERROR_LAST);
assert((int)error >= VP8_ENC_OK);
((WebPPicture*)pic)->error_code = error;
return 0;
}
int WebPReportProgress(const WebPPicture* const pic,
int percent, int* const percent_store) {
if (percent_store != NULL && percent != *percent_store) {
*percent_store = percent;
if (pic->progress_hook && !pic->progress_hook(percent, pic)) {
// user abort requested
WebPEncodingSetError(pic, VP8_ENC_ERROR_USER_ABORT);
return 0;
}
}
return 1; // ok
}
//------------------------------------------------------------------------------
int WebPEncode(const WebPConfig* config, WebPPicture* pic) {
int ok = 0;
if (pic == NULL) return 0;
WebPEncodingSetError(pic, VP8_ENC_OK); // all ok so far
if (config == NULL) { // bad params
return WebPEncodingSetError(pic, VP8_ENC_ERROR_NULL_PARAMETER);
}
if (!WebPValidateConfig(config)) {
return WebPEncodingSetError(pic, VP8_ENC_ERROR_INVALID_CONFIGURATION);
}
if (pic->width <= 0 || pic->height <= 0) {
return WebPEncodingSetError(pic, VP8_ENC_ERROR_BAD_DIMENSION);
}
if (pic->width > WEBP_MAX_DIMENSION || pic->height > WEBP_MAX_DIMENSION) {
return WebPEncodingSetError(pic, VP8_ENC_ERROR_BAD_DIMENSION);
}
if (pic->stats != NULL) memset(pic->stats, 0, sizeof(*pic->stats));
if (!config->lossless) {
VP8Encoder* enc = NULL;
if (pic->use_argb || pic->y == NULL || pic->u == NULL || pic->v == NULL) {
// Make sure we have YUVA samples.
if (config->use_sharp_yuv || (config->preprocessing & 4)) {
if (!WebPPictureSharpARGBToYUVA(pic)) {
return 0;
}
} else {
float dithering = 0.f;
if (config->preprocessing & 2) {
const float x = config->quality / 100.f;
const float x2 = x * x;
// slowly decreasing from max dithering at low quality (q->0)
// to 0.5 dithering amplitude at high quality (q->100)
dithering = 1.0f + (0.5f - 1.0f) * x2 * x2;
}
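// Worked values (illustration): q=0 -> 1.0 (max), q=50 -> 1.0 - 0.5 *
// 0.5^4 = 0.96875, q=100 -> 0.5.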
if (!WebPPictureARGBToYUVADithered(pic, WEBP_YUV420, dithering)) {
return 0;
}
}
}
if (!config->exact) {
WebPCleanupTransparentArea(pic);
}
enc = InitVP8Encoder(config, pic);
if (enc == NULL) return 0; // pic->error is already set.
// Note: each of the tasks below accounts for 20% in the progress report.
ok = VP8EncAnalyze(enc);
// Analysis is done, proceed to actual coding.
ok = ok && VP8EncStartAlpha(enc); // possibly done in parallel
if (!enc->use_tokens_) {
ok = ok && VP8EncLoop(enc);
} else {
ok = ok && VP8EncTokenLoop(enc);
}
ok = ok && VP8EncFinishAlpha(enc);
ok = ok && VP8EncWrite(enc);
StoreStats(enc);
if (!ok) {
VP8EncFreeBitWriters(enc);
}
ok &= DeleteVP8Encoder(enc); // must always be called, even if !ok
} else {
// Make sure we have ARGB samples.
if (pic->argb == NULL && !WebPPictureYUVAToARGB(pic)) {
return 0;
}
if (!config->exact) {
WebPReplaceTransparentPixels(pic, 0x000000);
}
ok = VP8LEncodeImage(config, pic); // Sets pic->error in case of problem.
}
return ok;
}


@ -0,0 +1,347 @@
// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Bit writing and boolean coder
//
// Author: Skal (pascal.massimino@gmail.com)
// Vikas Arora (vikaas.arora@gmail.com)
#include <assert.h>
#include <string.h> // for memcpy()
#include <stdlib.h>
#include "src/utils/bit_writer_utils.h"
#include "src/utils/endian_inl_utils.h"
#include "src/utils/utils.h"
//------------------------------------------------------------------------------
// VP8BitWriter
static int BitWriterResize(VP8BitWriter* const bw, size_t extra_size) {
uint8_t* new_buf;
size_t new_size;
const uint64_t needed_size_64b = (uint64_t)bw->pos_ + extra_size;
const size_t needed_size = (size_t)needed_size_64b;
if (needed_size_64b != needed_size) {
bw->error_ = 1;
return 0;
}
if (needed_size <= bw->max_pos_) return 1;
// If the following line wraps over 32bit, the test just after will catch it.
new_size = 2 * bw->max_pos_;
if (new_size < needed_size) new_size = needed_size;
if (new_size < 1024) new_size = 1024;
new_buf = (uint8_t*)WebPSafeMalloc(1ULL, new_size);
if (new_buf == NULL) {
bw->error_ = 1;
return 0;
}
if (bw->pos_ > 0) {
assert(bw->buf_ != NULL);
memcpy(new_buf, bw->buf_, bw->pos_);
}
WebPSafeFree(bw->buf_);
bw->buf_ = new_buf;
bw->max_pos_ = new_size;
return 1;
}
static void Flush(VP8BitWriter* const bw) {
const int s = 8 + bw->nb_bits_;
const int32_t bits = bw->value_ >> s;
assert(bw->nb_bits_ >= 0);
bw->value_ -= bits << s;
bw->nb_bits_ -= 8;
if ((bits & 0xff) != 0xff) {
size_t pos = bw->pos_;
if (!BitWriterResize(bw, bw->run_ + 1)) {
return;
}
if (bits & 0x100) { // overflow -> propagate carry over pending 0xff's
if (pos > 0) bw->buf_[pos - 1]++;
}
if (bw->run_ > 0) {
const int value = (bits & 0x100) ? 0x00 : 0xff;
for (; bw->run_ > 0; --bw->run_) bw->buf_[pos++] = value;
}
bw->buf_[pos++] = bits & 0xff;
bw->pos_ = pos;
} else {
bw->run_++; // delay writing of bytes 0xff, pending eventual carry.
}
}
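// Illustration (not upstream code): suppose the buffer ends with 0x12 and
// two 0xff bytes are pending (run_ == 2). If the next flushed byte carries
// (bits & 0x100), the tail becomes 0x13 0x00 0x00 before the new low byte
// is appended; without a carry the pending bytes are written back as
// 0xff 0xff.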
//------------------------------------------------------------------------------
// renormalization
static const uint8_t kNorm[128] = { // renorm_sizes[i] = 8 - log2(i)
7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0
};
// range = ((range + 1) << kNorm[range]) - 1
static const uint8_t kNewRange[128] = {
127, 127, 191, 127, 159, 191, 223, 127, 143, 159, 175, 191, 207, 223, 239,
127, 135, 143, 151, 159, 167, 175, 183, 191, 199, 207, 215, 223, 231, 239,
247, 127, 131, 135, 139, 143, 147, 151, 155, 159, 163, 167, 171, 175, 179,
183, 187, 191, 195, 199, 203, 207, 211, 215, 219, 223, 227, 231, 235, 239,
243, 247, 251, 127, 129, 131, 133, 135, 137, 139, 141, 143, 145, 147, 149,
151, 153, 155, 157, 159, 161, 163, 165, 167, 169, 171, 173, 175, 177, 179,
181, 183, 185, 187, 189, 191, 193, 195, 197, 199, 201, 203, 205, 207, 209,
211, 213, 215, 217, 219, 221, 223, 225, 227, 229, 231, 233, 235, 237, 239,
241, 243, 245, 247, 249, 251, 253, 127
};
int VP8PutBit(VP8BitWriter* const bw, int bit, int prob) {
const int split = (bw->range_ * prob) >> 8;
if (bit) {
bw->value_ += split + 1;
bw->range_ -= split + 1;
} else {
bw->range_ = split;
}
if (bw->range_ < 127) { // emit 'shift' bits out and renormalize
const int shift = kNorm[bw->range_];
bw->range_ = kNewRange[bw->range_];
bw->value_ <<= shift;
bw->nb_bits_ += shift;
if (bw->nb_bits_ > 0) Flush(bw);
}
return bit;
}
int VP8PutBitUniform(VP8BitWriter* const bw, int bit) {
const int split = bw->range_ >> 1;
if (bit) {
bw->value_ += split + 1;
bw->range_ -= split + 1;
} else {
bw->range_ = split;
}
if (bw->range_ < 127) {
bw->range_ = kNewRange[bw->range_];
bw->value_ <<= 1;
bw->nb_bits_ += 1;
if (bw->nb_bits_ > 0) Flush(bw);
}
return bit;
}
void VP8PutBits(VP8BitWriter* const bw, uint32_t value, int nb_bits) {
uint32_t mask;
assert(nb_bits > 0 && nb_bits < 32);
for (mask = 1u << (nb_bits - 1); mask; mask >>= 1) {
VP8PutBitUniform(bw, value & mask);
}
}
void VP8PutSignedBits(VP8BitWriter* const bw, int value, int nb_bits) {
if (!VP8PutBitUniform(bw, value != 0)) return;
if (value < 0) {
VP8PutBits(bw, ((-value) << 1) | 1, nb_bits + 1);
} else {
VP8PutBits(bw, value << 1, nb_bits + 1);
}
}
//------------------------------------------------------------------------------
int VP8BitWriterInit(VP8BitWriter* const bw, size_t expected_size) {
bw->range_ = 255 - 1;
bw->value_ = 0;
bw->run_ = 0;
bw->nb_bits_ = -8;
bw->pos_ = 0;
bw->max_pos_ = 0;
bw->error_ = 0;
bw->buf_ = NULL;
return (expected_size > 0) ? BitWriterResize(bw, expected_size) : 1;
}
uint8_t* VP8BitWriterFinish(VP8BitWriter* const bw) {
VP8PutBits(bw, 0, 9 - bw->nb_bits_);
bw->nb_bits_ = 0; // pad with zeroes
Flush(bw);
return bw->buf_;
}
int VP8BitWriterAppend(VP8BitWriter* const bw,
const uint8_t* data, size_t size) {
assert(data != NULL);
if (bw->nb_bits_ != -8) return 0; // Flush() must have been called
if (!BitWriterResize(bw, size)) return 0;
memcpy(bw->buf_ + bw->pos_, data, size);
bw->pos_ += size;
return 1;
}
void VP8BitWriterWipeOut(VP8BitWriter* const bw) {
if (bw != NULL) {
WebPSafeFree(bw->buf_);
memset(bw, 0, sizeof(*bw));
}
}
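// A minimal usage sketch (illustration only, not part of the upstream
// file). VP8BitWriterSize() is the inline accessor from bit_writer_utils.h.
#if 0  // example only
static size_t ExampleBoolCoding(uint8_t** const out) {
  VP8BitWriter bw;
  if (!VP8BitWriterInit(&bw, 16)) return 0;
  VP8PutBit(&bw, 1, 200);    // one bit, with probability 200/256 of a zero
  VP8PutBits(&bw, 0x5, 3);   // three bits, uniform probability
  *out = VP8BitWriterFinish(&bw);
  return VP8BitWriterSize(&bw);  // free later with VP8BitWriterWipeOut()
}
#endif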
//------------------------------------------------------------------------------
// VP8LBitWriter
// This is the minimum amount by which the memory buffer is guaranteed to grow
// when extra space is needed.
#define MIN_EXTRA_SIZE (32768ULL)
// Returns 1 on success.
static int VP8LBitWriterResize(VP8LBitWriter* const bw, size_t extra_size) {
uint8_t* allocated_buf;
size_t allocated_size;
const size_t max_bytes = bw->end_ - bw->buf_;
const size_t current_size = bw->cur_ - bw->buf_;
const uint64_t size_required_64b = (uint64_t)current_size + extra_size;
const size_t size_required = (size_t)size_required_64b;
if (size_required != size_required_64b) {
bw->error_ = 1;
return 0;
}
if (max_bytes > 0 && size_required <= max_bytes) return 1;
allocated_size = (3 * max_bytes) >> 1;
if (allocated_size < size_required) allocated_size = size_required;
// make allocated size multiple of 1k
allocated_size = (((allocated_size >> 10) + 1) << 10);
allocated_buf = (uint8_t*)WebPSafeMalloc(1ULL, allocated_size);
if (allocated_buf == NULL) {
bw->error_ = 1;
return 0;
}
if (current_size > 0) {
memcpy(allocated_buf, bw->buf_, current_size);
}
WebPSafeFree(bw->buf_);
bw->buf_ = allocated_buf;
bw->cur_ = bw->buf_ + current_size;
bw->end_ = bw->buf_ + allocated_size;
return 1;
}
int VP8LBitWriterInit(VP8LBitWriter* const bw, size_t expected_size) {
memset(bw, 0, sizeof(*bw));
return VP8LBitWriterResize(bw, expected_size);
}
int VP8LBitWriterClone(const VP8LBitWriter* const src,
VP8LBitWriter* const dst) {
const size_t current_size = src->cur_ - src->buf_;
assert(src->cur_ >= src->buf_ && src->cur_ <= src->end_);
if (!VP8LBitWriterResize(dst, current_size)) return 0;
memcpy(dst->buf_, src->buf_, current_size);
dst->bits_ = src->bits_;
dst->used_ = src->used_;
dst->error_ = src->error_;
dst->cur_ = dst->buf_ + current_size;
return 1;
}
void VP8LBitWriterWipeOut(VP8LBitWriter* const bw) {
if (bw != NULL) {
WebPSafeFree(bw->buf_);
memset(bw, 0, sizeof(*bw));
}
}
void VP8LBitWriterReset(const VP8LBitWriter* const bw_init,
VP8LBitWriter* const bw) {
bw->bits_ = bw_init->bits_;
bw->used_ = bw_init->used_;
bw->cur_ = bw->buf_ + (bw_init->cur_ - bw_init->buf_);
assert(bw->cur_ <= bw->end_);
bw->error_ = bw_init->error_;
}
void VP8LBitWriterSwap(VP8LBitWriter* const src, VP8LBitWriter* const dst) {
const VP8LBitWriter tmp = *src;
*src = *dst;
*dst = tmp;
}
void VP8LPutBitsFlushBits(VP8LBitWriter* const bw) {
// If needed, make some room by flushing some bits out.
if (bw->cur_ + VP8L_WRITER_BYTES > bw->end_) {
const uint64_t extra_size = (bw->end_ - bw->buf_) + MIN_EXTRA_SIZE;
if (!CheckSizeOverflow(extra_size) ||
!VP8LBitWriterResize(bw, (size_t)extra_size)) {
bw->cur_ = bw->buf_;
bw->error_ = 1;
return;
}
}
*(vp8l_wtype_t*)bw->cur_ = (vp8l_wtype_t)WSWAP((vp8l_wtype_t)bw->bits_);
bw->cur_ += VP8L_WRITER_BYTES;
bw->bits_ >>= VP8L_WRITER_BITS;
bw->used_ -= VP8L_WRITER_BITS;
}
void VP8LPutBitsInternal(VP8LBitWriter* const bw, uint32_t bits, int n_bits) {
assert(n_bits <= 32);
// That's the max we can handle:
assert(sizeof(vp8l_wtype_t) == 2);
if (n_bits > 0) {
vp8l_atype_t lbits = bw->bits_;
int used = bw->used_;
// Special case of overflow handling for 32bit accumulator (2-steps flush).
#if VP8L_WRITER_BITS == 16
if (used + n_bits >= VP8L_WRITER_MAX_BITS) {
// Fill up all the VP8L_WRITER_MAX_BITS so it can be flushed out below.
const int shift = VP8L_WRITER_MAX_BITS - used;
lbits |= (vp8l_atype_t)bits << used;
used = VP8L_WRITER_MAX_BITS;
n_bits -= shift;
bits >>= shift;
assert(n_bits <= VP8L_WRITER_MAX_BITS);
}
#endif
// If needed, make some room by flushing some bits out.
while (used >= VP8L_WRITER_BITS) {
if (bw->cur_ + VP8L_WRITER_BYTES > bw->end_) {
const uint64_t extra_size = (bw->end_ - bw->buf_) + MIN_EXTRA_SIZE;
if (!CheckSizeOverflow(extra_size) ||
!VP8LBitWriterResize(bw, (size_t)extra_size)) {
bw->cur_ = bw->buf_;
bw->error_ = 1;
return;
}
}
*(vp8l_wtype_t*)bw->cur_ = (vp8l_wtype_t)WSWAP((vp8l_wtype_t)lbits);
bw->cur_ += VP8L_WRITER_BYTES;
lbits >>= VP8L_WRITER_BITS;
used -= VP8L_WRITER_BITS;
}
bw->bits_ = lbits | ((vp8l_atype_t)bits << used);
bw->used_ = used + n_bits;
}
}
uint8_t* VP8LBitWriterFinish(VP8LBitWriter* const bw) {
// flush leftover bits
if (VP8LBitWriterResize(bw, (bw->used_ + 7) >> 3)) {
while (bw->used_ > 0) {
*bw->cur_++ = (uint8_t)bw->bits_;
bw->bits_ >>= 8;
bw->used_ -= 8;
}
bw->used_ = 0;
}
return bw->buf_;
}
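// A minimal usage sketch (illustration only, not part of the upstream
// file). VP8LPutBits() is the inline front-end in bit_writer_utils.h that
// feeds VP8LPutBitsInternal() above.
#if 0  // example only
static const uint8_t* ExampleLosslessWrite(VP8LBitWriter* const bw) {
  if (!VP8LBitWriterInit(bw, 1024)) return NULL;
  VP8LPutBits(bw, 0x12345, 20);    // packed LSB-first into the accumulator
  return VP8LBitWriterFinish(bw);  // check bw->error_ before using the buf
}
#endif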
//------------------------------------------------------------------------------


@ -0,0 +1,416 @@
// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Author: Jyrki Alakuijala (jyrki@google.com)
//
// Entropy encoding (Huffman) for webp lossless.
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include "src/utils/huffman_encode_utils.h"
#include "src/utils/utils.h"
#include "src/webp/format_constants.h"
// -----------------------------------------------------------------------------
// Util function to optimize the symbol map for RLE coding
// Heuristics for selecting the stride ranges to collapse.
static int ValuesShouldBeCollapsedToStrideAverage(int a, int b) {
return abs(a - b) < 4;
}
// Change the population counts so that the subsequent Huffman tree
// compression, especially its RLE part, gives smaller output.
static void OptimizeHuffmanForRle(int length, uint8_t* const good_for_rle,
uint32_t* const counts) {
// 1) Let's make the Huffman code more compatible with rle encoding.
int i;
for (; length >= 0; --length) {
if (length == 0) {
return; // All zeros.
}
if (counts[length - 1] != 0) {
// Now counts[0..length - 1] does not have trailing zeros.
break;
}
}
// 2) Let's mark all population counts that already can be encoded
// with an rle code.
{
// Let's not spoil any of the existing good rle codes.
// Mark any seq of 0's that is longer than 5 as good_for_rle.
// Mark any seq of non-0's that is longer than 7 as good_for_rle.
uint32_t symbol = counts[0];
int stride = 0;
for (i = 0; i < length + 1; ++i) {
if (i == length || counts[i] != symbol) {
if ((symbol == 0 && stride >= 5) ||
(symbol != 0 && stride >= 7)) {
int k;
for (k = 0; k < stride; ++k) {
good_for_rle[i - k - 1] = 1;
}
}
stride = 1;
if (i != length) {
symbol = counts[i];
}
} else {
++stride;
}
}
}
// 3) Let's replace those population counts that lead to more rle codes.
{
uint32_t stride = 0;
uint32_t limit = counts[0];
uint32_t sum = 0;
for (i = 0; i < length + 1; ++i) {
if (i == length || good_for_rle[i] ||
(i != 0 && good_for_rle[i - 1]) ||
!ValuesShouldBeCollapsedToStrideAverage(counts[i], limit)) {
if (stride >= 4 || (stride >= 3 && sum == 0)) {
uint32_t k;
// The stride must end, collapse what we have, if we have enough (4).
uint32_t count = (sum + stride / 2) / stride;
if (count < 1) {
count = 1;
}
if (sum == 0) {
// Don't make an all zeros stride to be upgraded to ones.
count = 0;
}
for (k = 0; k < stride; ++k) {
// We don't want to change the value at counts[i]; it already
// belongs to the next stride. Hence the '- 1'.
counts[i - k - 1] = count;
}
}
stride = 0;
sum = 0;
if (i < length - 3) {
// All interesting strides have a count of at least 4,
// at least when non-zeros.
limit = (counts[i] + counts[i + 1] +
counts[i + 2] + counts[i + 3] + 2) / 4;
} else if (i < length) {
limit = counts[i];
} else {
limit = 0;
}
}
++stride;
if (i != length) {
sum += counts[i];
if (stride >= 4) {
limit = (sum + stride / 2) / stride;
}
}
}
}
}
// A comparer function for two Huffman trees: sorts first by 'total count'
// (more comes first), and then by 'value' (smaller comes first).
static int CompareHuffmanTrees(const void* ptr1, const void* ptr2) {
const HuffmanTree* const t1 = (const HuffmanTree*)ptr1;
const HuffmanTree* const t2 = (const HuffmanTree*)ptr2;
if (t1->total_count_ > t2->total_count_) {
return -1;
} else if (t1->total_count_ < t2->total_count_) {
return 1;
} else {
assert(t1->value_ != t2->value_);
return (t1->value_ < t2->value_) ? -1 : 1;
}
}
static void SetBitDepths(const HuffmanTree* const tree,
const HuffmanTree* const pool,
uint8_t* const bit_depths, int level) {
if (tree->pool_index_left_ >= 0) {
SetBitDepths(&pool[tree->pool_index_left_], pool, bit_depths, level + 1);
SetBitDepths(&pool[tree->pool_index_right_], pool, bit_depths, level + 1);
} else {
bit_depths[tree->value_] = level;
}
}
// Create an optimal Huffman tree.
//
// (data,length): population counts.
// tree_limit: maximum bit depth (inclusive) of the codes.
// bit_depths[]: how many bits are used for the symbol.
//
// Returns 0 when an error has occurred.
//
// The catch here is that the tree cannot be arbitrarily deep.
//
// count_min is the value that is faked as the minimum population count,
// and this minimum is raised until the tree satisfies the maximum-depth
// requirement.
//
// This algorithm does not perform well for very long data blocks,
// especially when population counts are longer than 2**tree_limit, but
// we are not planning to use it with extremely long blocks.
//
// See http://en.wikipedia.org/wiki/Huffman_coding
static void GenerateOptimalTree(const uint32_t* const histogram,
int histogram_size,
HuffmanTree* tree, int tree_depth_limit,
uint8_t* const bit_depths) {
uint32_t count_min;
HuffmanTree* tree_pool;
int tree_size_orig = 0;
int i;
for (i = 0; i < histogram_size; ++i) {
if (histogram[i] != 0) {
++tree_size_orig;
}
}
if (tree_size_orig == 0) { // pretty optimal already!
return;
}
tree_pool = tree + tree_size_orig;
// For block sizes with fewer than 64k symbols we never need to do a
// second iteration of this loop.
// If we actually start running inside this loop a lot, we would perhaps
// be better off with the Katajainen algorithm.
assert(tree_size_orig <= (1 << (tree_depth_limit - 1)));
for (count_min = 1; ; count_min *= 2) {
int tree_size = tree_size_orig;
// We need to pack the Huffman tree in tree_depth_limit bits.
// So, we try by faking histogram entries to be at least 'count_min'.
int idx = 0;
int j;
for (j = 0; j < histogram_size; ++j) {
if (histogram[j] != 0) {
const uint32_t count =
(histogram[j] < count_min) ? count_min : histogram[j];
tree[idx].total_count_ = count;
tree[idx].value_ = j;
tree[idx].pool_index_left_ = -1;
tree[idx].pool_index_right_ = -1;
++idx;
}
}
// Build the Huffman tree.
qsort(tree, tree_size, sizeof(*tree), CompareHuffmanTrees);
if (tree_size > 1) { // Normal case.
int tree_pool_size = 0;
while (tree_size > 1) { // Finish when we have only one root.
uint32_t count;
tree_pool[tree_pool_size++] = tree[tree_size - 1];
tree_pool[tree_pool_size++] = tree[tree_size - 2];
count = tree_pool[tree_pool_size - 1].total_count_ +
tree_pool[tree_pool_size - 2].total_count_;
tree_size -= 2;
{
// Search for the insertion point.
int k;
for (k = 0; k < tree_size; ++k) {
if (tree[k].total_count_ <= count) {
break;
}
}
memmove(tree + (k + 1), tree + k, (tree_size - k) * sizeof(*tree));
tree[k].total_count_ = count;
tree[k].value_ = -1;
tree[k].pool_index_left_ = tree_pool_size - 1;
tree[k].pool_index_right_ = tree_pool_size - 2;
tree_size = tree_size + 1;
}
}
SetBitDepths(&tree[0], tree_pool, bit_depths, 0);
} else if (tree_size == 1) { // Trivial case: only one element.
bit_depths[tree[0].value_] = 1;
}
{
// Test if this Huffman tree satisfies our 'tree_depth_limit' criteria.
int max_depth = bit_depths[0];
for (j = 1; j < histogram_size; ++j) {
if (max_depth < bit_depths[j]) {
max_depth = bit_depths[j];
}
}
if (max_depth <= tree_depth_limit) {
break;
}
}
}
}
// -----------------------------------------------------------------------------
// Coding of the Huffman tree values
static HuffmanTreeToken* CodeRepeatedValues(int repetitions,
HuffmanTreeToken* tokens,
int value, int prev_value) {
assert(value <= MAX_ALLOWED_CODE_LENGTH);
if (value != prev_value) {
tokens->code = value;
tokens->extra_bits = 0;
++tokens;
--repetitions;
}
while (repetitions >= 1) {
if (repetitions < 3) {
int i;
for (i = 0; i < repetitions; ++i) {
tokens->code = value;
tokens->extra_bits = 0;
++tokens;
}
break;
} else if (repetitions < 7) {
tokens->code = 16;
tokens->extra_bits = repetitions - 3;
++tokens;
break;
} else {
tokens->code = 16;
tokens->extra_bits = 3;
++tokens;
repetitions -= 6;
}
}
return tokens;
}
static HuffmanTreeToken* CodeRepeatedZeros(int repetitions,
HuffmanTreeToken* tokens) {
while (repetitions >= 1) {
if (repetitions < 3) {
int i;
for (i = 0; i < repetitions; ++i) {
tokens->code = 0; // 0-value
tokens->extra_bits = 0;
++tokens;
}
break;
} else if (repetitions < 11) {
tokens->code = 17;
tokens->extra_bits = repetitions - 3;
++tokens;
break;
} else if (repetitions < 139) {
tokens->code = 18;
tokens->extra_bits = repetitions - 11;
++tokens;
break;
} else {
tokens->code = 18;
tokens->extra_bits = 0x7f; // 138 repeated 0s
++tokens;
repetitions -= 138;
}
}
return tokens;
}
int VP8LCreateCompressedHuffmanTree(const HuffmanTreeCode* const tree,
HuffmanTreeToken* tokens, int max_tokens) {
HuffmanTreeToken* const starting_token = tokens;
HuffmanTreeToken* const ending_token = tokens + max_tokens;
const int depth_size = tree->num_symbols;
  int prev_value = 8;  // Initial 'previous value' for the RLE: 8 is the default code length.
int i = 0;
assert(tokens != NULL);
while (i < depth_size) {
const int value = tree->code_lengths[i];
int k = i + 1;
int runs;
while (k < depth_size && tree->code_lengths[k] == value) ++k;
runs = k - i;
if (value == 0) {
tokens = CodeRepeatedZeros(runs, tokens);
} else {
tokens = CodeRepeatedValues(runs, tokens, value, prev_value);
prev_value = value;
}
i += runs;
assert(tokens <= ending_token);
}
  (void)ending_token;  // Suppress 'unused variable' warning when asserts are disabled.
return (int)(tokens - starting_token);
}
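// Worked example: with the scheme above, seven consecutive symbols of
// length 4 (previous value 8) tokenize as
//   {code=4, extra=0} {code=16, extra=3}        (1 literal + 6 repeats)
// while a run of 140 zeros tokenizes as
//   {code=18, extra=0x7f} {code=0} {code=0}     (138 zeros + 2 literals)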
// -----------------------------------------------------------------------------
// Pre-reversed 4-bit values.
static const uint8_t kReversedBits[16] = {
0x0, 0x8, 0x4, 0xc, 0x2, 0xa, 0x6, 0xe,
0x1, 0x9, 0x5, 0xd, 0x3, 0xb, 0x7, 0xf
};
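// Reverse the 'num_bits' least-significant bits of 'bits', 4 bits at a
// time, via the lookup table above. Canonical codes are built MSB-first,
// but the bit writer emits the least-significant bit first, so codes are
// stored pre-reversed to appear in canonical order on the wire.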
static uint32_t ReverseBits(int num_bits, uint32_t bits) {
uint32_t retval = 0;
int i = 0;
while (i < num_bits) {
i += 4;
retval |= kReversedBits[bits & 0xf] << (MAX_ALLOWED_CODE_LENGTH + 1 - i);
bits >>= 4;
}
retval >>= (MAX_ALLOWED_CODE_LENGTH + 1 - num_bits);
return retval;
}
// Get the actual bit values for a tree of bit depths.
static void ConvertBitDepthsToSymbols(HuffmanTreeCode* const tree) {
// 0 bit-depth means that the symbol does not exist.
int i;
int len;
uint32_t next_code[MAX_ALLOWED_CODE_LENGTH + 1];
int depth_count[MAX_ALLOWED_CODE_LENGTH + 1] = { 0 };
assert(tree != NULL);
len = tree->num_symbols;
for (i = 0; i < len; ++i) {
const int code_length = tree->code_lengths[i];
assert(code_length <= MAX_ALLOWED_CODE_LENGTH);
++depth_count[code_length];
}
  depth_count[0] = 0;  // ignore unused symbols (0 bit-depth)
next_code[0] = 0;
{
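    // Standard canonical-code construction: the first code of length i
    // starts right after the (left-shifted) last code of length i - 1.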
uint32_t code = 0;
for (i = 1; i <= MAX_ALLOWED_CODE_LENGTH; ++i) {
code = (code + depth_count[i - 1]) << 1;
next_code[i] = code;
}
}
for (i = 0; i < len; ++i) {
const int code_length = tree->code_lengths[i];
tree->codes[i] = ReverseBits(code_length, next_code[code_length]++);
}
}
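// Worked example: four symbols with code lengths {1, 2, 3, 3} give
// depth_count = {0, 1, 1, 2, ...} and next_code = {0, 0, 2, 6, ...},
// i.e. the canonical codes 0, 10, 110 and 111; after bit-reversal the
// stored values are 0, 1, 3 and 7.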
// -----------------------------------------------------------------------------
// Main entry point
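// Note: 'histogram' is modified in place by the RLE optimization below,
// 'buf_rle' is a scratch buffer of 'num_symbols' bytes, and 'huff_tree'
// must have room for the leaves plus the merged-node pool, i.e. up to
// 3 * num_symbols entries.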
void VP8LCreateHuffmanTree(uint32_t* const histogram, int tree_depth_limit,
uint8_t* const buf_rle, HuffmanTree* const huff_tree,
HuffmanTreeCode* const huff_code) {
const int num_symbols = huff_code->num_symbols;
memset(buf_rle, 0, num_symbols * sizeof(*buf_rle));
OptimizeHuffmanForRle(num_symbols, buf_rle, histogram);
GenerateOptimalTree(histogram, num_symbols, huff_tree, tree_depth_limit,
huff_code->code_lengths);
// Create the actual bit codes for the bit lengths.
ConvertBitDepthsToSymbols(huff_code);
}

View File

@@ -9,8 +9,10 @@ with Files('**'):
SOURCES += [
'bit_reader_utils.c',
'bit_writer_utils.c',
'color_cache_utils.c',
'filters_utils.c',
'huffman_encode_utils.c',
'huffman_utils.c',
'quant_levels_dec_utils.c',
'quant_levels_utils.c',

View File

@@ -21,77 +21,20 @@ cp $1/src/webp/*.h src/webp
mkdir -p src/dec
cp $1/src/dec/*.h src/dec
cp $1/src/dec/alpha_dec.c src/dec
cp $1/src/dec/buffer_dec.c src/dec
cp $1/src/dec/frame_dec.c src/dec
cp $1/src/dec/idec_dec.c src/dec
cp $1/src/dec/io_dec.c src/dec
cp $1/src/dec/quant_dec.c src/dec
cp $1/src/dec/tree_dec.c src/dec
cp $1/src/dec/vp8_dec.c src/dec
cp $1/src/dec/vp8l_dec.c src/dec
cp $1/src/dec/webp_dec.c src/dec
cp $1/src/dec/*.c src/dec
mkdir -p src/demux
cp $1/src/demux/demux.c src/demux
mkdir -p src/dsp
cp $1/src/dsp/*.h src/dsp
cp $1/src/dsp/alpha_processing.c src/dsp
cp $1/src/dsp/alpha_processing_mips_dsp_r2.c src/dsp
cp $1/src/dsp/alpha_processing_neon.c src/dsp
cp $1/src/dsp/alpha_processing_sse2.c src/dsp
cp $1/src/dsp/alpha_processing_sse41.c src/dsp
cp $1/src/dsp/dec.c src/dsp
cp $1/src/dsp/dec_clip_tables.c src/dsp
cp $1/src/dsp/dec_mips32.c src/dsp
cp $1/src/dsp/dec_mips_dsp_r2.c src/dsp
cp $1/src/dsp/dec_msa.c src/dsp
cp $1/src/dsp/dec_neon.c src/dsp
cp $1/src/dsp/dec_sse2.c src/dsp
cp $1/src/dsp/dec_sse41.c src/dsp
cp $1/src/dsp/filters.c src/dsp
cp $1/src/dsp/filters_mips_dsp_r2.c src/dsp
cp $1/src/dsp/filters_msa.c src/dsp
cp $1/src/dsp/filters_neon.c src/dsp
cp $1/src/dsp/filters_sse2.c src/dsp
cp $1/src/dsp/lossless.c src/dsp
cp $1/src/dsp/lossless_mips_dsp_r2.c src/dsp
cp $1/src/dsp/lossless_msa.c src/dsp
cp $1/src/dsp/lossless_neon.c src/dsp
cp $1/src/dsp/lossless_sse2.c src/dsp
cp $1/src/dsp/lossless_sse41.c src/dsp
cp $1/src/dsp/rescaler.c src/dsp
cp $1/src/dsp/rescaler_mips32.c src/dsp
cp $1/src/dsp/rescaler_mips_dsp_r2.c src/dsp
cp $1/src/dsp/rescaler_msa.c src/dsp
cp $1/src/dsp/rescaler_neon.c src/dsp
cp $1/src/dsp/rescaler_sse2.c src/dsp
cp $1/src/dsp/upsampling.c src/dsp
cp $1/src/dsp/upsampling_mips_dsp_r2.c src/dsp
cp $1/src/dsp/upsampling_msa.c src/dsp
cp $1/src/dsp/upsampling_neon.c src/dsp
cp $1/src/dsp/upsampling_sse2.c src/dsp
cp $1/src/dsp/upsampling_sse41.c src/dsp
cp $1/src/dsp/yuv.c src/dsp
cp $1/src/dsp/yuv_mips32.c src/dsp
cp $1/src/dsp/yuv_mips_dsp_r2.c src/dsp
cp $1/src/dsp/yuv_neon.c src/dsp
cp $1/src/dsp/yuv_sse2.c src/dsp
cp $1/src/dsp/yuv_sse41.c src/dsp
cp $1/src/dsp/*.c src/dsp
rm src/dsp/cpu.c
mkdir -p src/enc
cp $1/src/enc/*.h src/enc
cp $1/src/enc/*.c src/enc
mkdir -p src/utils
cp $1/src/utils/*.h src/utils
cp $1/src/utils/bit_reader_utils.c src/utils
cp $1/src/utils/color_cache_utils.c src/utils
cp $1/src/utils/filters_utils.c src/utils
cp $1/src/utils/huffman_utils.c src/utils
cp $1/src/utils/quant_levels_dec_utils.c src/utils
cp $1/src/utils/quant_levels_utils.c src/utils
cp $1/src/utils/random_utils.c src/utils
cp $1/src/utils/rescaler_utils.c src/utils
cp $1/src/utils/thread_utils.c src/utils
cp $1/src/utils/utils.c src/utils
cp $1/src/utils/*.c src/utils