darling-WebCore/contentextensions/DFAMinimizer.cpp
2023-01-24 16:42:21 -08:00

510 lines
20 KiB
C++

/*
* Copyright (C) 2015 Apple Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "DFAMinimizer.h"
#if ENABLE(CONTENT_EXTENSIONS)
#include "DFA.h"
#include "DFANode.h"
#include "MutableRangeList.h"
#include <wtf/HashMap.h>
#include <wtf/Hasher.h>
#include <wtf/Vector.h>
namespace WebCore {
namespace ContentExtensions {
namespace {
template<typename VectorType, typename Iterable, typename Function>
static inline void iterateIntersections(const VectorType& singularTransitionsFirsts, const Iterable& iterableTransitionList, const Function& intersectionHandler)
{
ASSERT(!singularTransitionsFirsts.isEmpty());
auto otherIterator = iterableTransitionList.begin();
auto otherEnd = iterableTransitionList.end();
if (otherIterator == otherEnd)
return;
unsigned singularTransitionsLength = singularTransitionsFirsts.size();
unsigned singularTransitionsFirstsIndex = 0;
for (; otherIterator != otherEnd; ++otherIterator) {
auto firstCharacter = otherIterator.first();
while (singularTransitionsFirstsIndex < singularTransitionsLength
&& singularTransitionsFirsts[singularTransitionsFirstsIndex] != firstCharacter)
++singularTransitionsFirstsIndex;
intersectionHandler(singularTransitionsFirstsIndex, otherIterator);
++singularTransitionsFirstsIndex;
auto lastCharacter = otherIterator.last();
while (singularTransitionsFirstsIndex < singularTransitionsLength
&& singularTransitionsFirsts[singularTransitionsFirstsIndex] <= lastCharacter) {
intersectionHandler(singularTransitionsFirstsIndex, otherIterator);
++singularTransitionsFirstsIndex;
}
}
}
class Partition {
public:
void initialize(unsigned size)
{
if (!size)
return;
m_sets.reserveInitialCapacity(size);
m_partitionedElements.resize(size);
m_elementPositionInPartitionedNodes.resize(size);
m_elementToSetMap.resize(size);
for (unsigned i = 0; i < size; ++i) {
m_partitionedElements[i] = i;
m_elementPositionInPartitionedNodes[i] = i;
m_elementToSetMap[i] = 0;
}
m_sets.uncheckedAppend(SetDescriptor { 0, size, 0 });
}
void reserveUninitializedCapacity(unsigned elementCount)
{
m_partitionedElements.resize(elementCount);
m_elementPositionInPartitionedNodes.resize(elementCount);
m_elementToSetMap.resize(elementCount);
}
void addSetUnchecked(unsigned start, unsigned size)
{
m_sets.append(SetDescriptor { start, size, 0 });
}
void setElementUnchecked(unsigned elementIndex, unsigned positionInPartition, unsigned setIndex)
{
ASSERT(setIndex < m_sets.size());
m_partitionedElements[positionInPartition] = elementIndex;
m_elementPositionInPartitionedNodes[elementIndex] = positionInPartition;
m_elementToSetMap[elementIndex] = setIndex;
}
unsigned startOffsetOfSet(unsigned setIndex) const
{
return m_sets[setIndex].start;
}
ALWAYS_INLINE void markElementInCurrentGeneration(unsigned elementIndex)
{
// Swap the node with the first unmarked node.
unsigned setIndex = m_elementToSetMap[elementIndex];
SetDescriptor& setDescriptor = m_sets[setIndex];
unsigned elementPositionInPartition = m_elementPositionInPartitionedNodes[elementIndex];
ASSERT(elementPositionInPartition >= setDescriptor.start);
ASSERT(elementPositionInPartition < setDescriptor.end());
unsigned firstUnmarkedElementPositionInPartition = setDescriptor.indexAfterMarkedElements();
ASSERT(firstUnmarkedElementPositionInPartition >= setDescriptor.start && firstUnmarkedElementPositionInPartition < setDescriptor.end());
ASSERT(firstUnmarkedElementPositionInPartition >= firstUnmarkedElementPositionInPartition);
// Swap the nodes in the set.
unsigned firstUnmarkedElement = m_partitionedElements[firstUnmarkedElementPositionInPartition];
m_partitionedElements[firstUnmarkedElementPositionInPartition] = elementIndex;
m_partitionedElements[elementPositionInPartition] = firstUnmarkedElement;
// Update their index.
m_elementPositionInPartitionedNodes[elementIndex] = firstUnmarkedElementPositionInPartition;
m_elementPositionInPartitionedNodes[firstUnmarkedElement] = elementPositionInPartition;
if (!setDescriptor.markedCount) {
ASSERT(!m_setsMarkedInCurrentGeneration.contains(setIndex));
m_setsMarkedInCurrentGeneration.append(setIndex);
}
++setDescriptor.markedCount;
}
// The function passed as argument MUST not modify the partition.
template<typename Function>
void refineGeneration(const Function& function)
{
for (unsigned setIndex : m_setsMarkedInCurrentGeneration) {
SetDescriptor& setDescriptor = m_sets[setIndex];
if (setDescriptor.markedCount == setDescriptor.size) {
// Everything is marked, there is nothing to refine.
setDescriptor.markedCount = 0;
continue;
}
SetDescriptor newSet;
bool newSetIsMarkedSet = setDescriptor.markedCount * 2 <= setDescriptor.size;
if (newSetIsMarkedSet) {
// Less than half of the nodes have been marked.
newSet = { setDescriptor.start, setDescriptor.markedCount, 0 };
setDescriptor.start = setDescriptor.start + setDescriptor.markedCount;
} else
newSet = { setDescriptor.start + setDescriptor.markedCount, setDescriptor.size - setDescriptor.markedCount, 0 };
setDescriptor.size -= newSet.size;
setDescriptor.markedCount = 0;
unsigned newSetIndex = m_sets.size();
m_sets.append(newSet);
for (unsigned i = newSet.start; i < newSet.end(); ++i)
m_elementToSetMap[m_partitionedElements[i]] = newSetIndex;
function(newSetIndex);
}
m_setsMarkedInCurrentGeneration.clear();
}
// Call Function() on every node of a given subset.
template<typename Function>
void iterateSet(unsigned setIndex, const Function& function)
{
SetDescriptor& setDescriptor = m_sets[setIndex];
for (unsigned i = setDescriptor.start; i < setDescriptor.end(); ++i)
function(m_partitionedElements[i]);
}
// Index of the set containing the Node.
unsigned setIndex(unsigned elementIndex) const
{
return m_elementToSetMap[elementIndex];
}
// NodeIndex of the first element in the set.
unsigned firstElementInSet(unsigned setIndex) const
{
return m_partitionedElements[m_sets[setIndex].start];
}
unsigned size() const
{
return m_sets.size();
}
private:
struct SetDescriptor {
unsigned start;
unsigned size;
unsigned markedCount;
unsigned indexAfterMarkedElements() const { return start + markedCount; }
unsigned end() const { return start + size; }
};
// List of sets.
Vector<SetDescriptor, 0, ContentExtensionsOverflowHandler> m_sets;
// All the element indices such that two elements of the same set never have a element of a different set between them.
Vector<unsigned, 0, ContentExtensionsOverflowHandler> m_partitionedElements;
// Map elementIndex->position in the partitionedElements.
Vector<unsigned, 0, ContentExtensionsOverflowHandler> m_elementPositionInPartitionedNodes;
// Map elementIndex->SetIndex.
Vector<unsigned, 0, ContentExtensionsOverflowHandler> m_elementToSetMap;
// List of sets with any marked node. Each set can appear at most once.
// FIXME: find a good inline size for this.
Vector<unsigned, 128, ContentExtensionsOverflowHandler> m_setsMarkedInCurrentGeneration;
};
class FullGraphPartition {
typedef MutableRangeList<char, uint32_t, 128> SingularTransitionsMutableRangeList;
public:
FullGraphPartition(const DFA& dfa)
{
m_nodePartition.initialize(dfa.nodes.size());
SingularTransitionsMutableRangeList singularTransitions;
CounterConverter counterConverter;
for (const DFANode& node : dfa.nodes) {
if (node.isKilled())
continue;
auto transitions = node.transitions(dfa);
singularTransitions.extend(transitions.begin(), transitions.end(), counterConverter);
}
// Count the number of transition for each singular range. This will give us the bucket size
// for the transition partition, where transitions are partitioned by "symbol".
unsigned rangeIndexAccumulator = 0;
for (const auto& transition : singularTransitions) {
m_transitionPartition.addSetUnchecked(rangeIndexAccumulator, transition.data);
rangeIndexAccumulator += transition.data;
}
// Count the number of incoming transitions per node.
m_flattenedTransitionsStartOffsetPerNode.resize(dfa.nodes.size());
memset(m_flattenedTransitionsStartOffsetPerNode.data(), 0, m_flattenedTransitionsStartOffsetPerNode.size() * sizeof(unsigned));
Vector<char, 0, ContentExtensionsOverflowHandler> singularTransitionsFirsts;
singularTransitionsFirsts.reserveInitialCapacity(singularTransitions.m_ranges.size());
for (const auto& transition : singularTransitions)
singularTransitionsFirsts.uncheckedAppend(transition.first);
for (const DFANode& node : dfa.nodes) {
if (node.isKilled())
continue;
auto transitions = node.transitions(dfa);
iterateIntersections(singularTransitionsFirsts, transitions, [&](unsigned, const DFANode::ConstRangeIterator& origin) {
uint32_t targetNodeIndex = origin.target();
++m_flattenedTransitionsStartOffsetPerNode[targetNodeIndex];
});
}
// Accumulate the offsets. This gives us the start position of each bucket.
unsigned transitionAccumulator = 0;
for (unsigned i = 0; i < m_flattenedTransitionsStartOffsetPerNode.size(); ++i) {
unsigned transitionsCountForNode = m_flattenedTransitionsStartOffsetPerNode[i];
m_flattenedTransitionsStartOffsetPerNode[i] = transitionAccumulator;
transitionAccumulator += transitionsCountForNode;
}
unsigned flattenedTransitionsSize = transitionAccumulator;
ASSERT_WITH_MESSAGE(flattenedTransitionsSize == rangeIndexAccumulator, "The number of transitions should be the same, regardless of how they are arranged in buckets.");
m_transitionPartition.reserveUninitializedCapacity(flattenedTransitionsSize);
// Next, let's fill the transition table and set up the size of each group at the same time.
m_flattenedTransitionsSizePerNode.resize(dfa.nodes.size());
for (unsigned& counter : m_flattenedTransitionsSizePerNode)
counter = 0;
m_flattenedTransitions.resize(flattenedTransitionsSize);
Vector<uint32_t> transitionPerRangeOffset(m_transitionPartition.size());
memset(transitionPerRangeOffset.data(), 0, transitionPerRangeOffset.size() * sizeof(uint32_t));
for (unsigned i = 0; i < dfa.nodes.size(); ++i) {
const DFANode& node = dfa.nodes[i];
if (node.isKilled())
continue;
auto transitions = node.transitions(dfa);
iterateIntersections(singularTransitionsFirsts, transitions, [&](unsigned singularTransitonIndex, const DFANode::ConstRangeIterator& origin) {
uint32_t targetNodeIndex = origin.target();
unsigned start = m_flattenedTransitionsStartOffsetPerNode[targetNodeIndex];
unsigned offset = m_flattenedTransitionsSizePerNode[targetNodeIndex];
unsigned positionInFlattenedTransitions = start + offset;
m_flattenedTransitions[positionInFlattenedTransitions] = Transition({ i });
uint32_t& inRangeOffset = transitionPerRangeOffset[singularTransitonIndex];
unsigned positionInTransitionPartition = m_transitionPartition.startOffsetOfSet(singularTransitonIndex) + inRangeOffset;
++inRangeOffset;
m_transitionPartition.setElementUnchecked(positionInFlattenedTransitions, positionInTransitionPartition, singularTransitonIndex);
++m_flattenedTransitionsSizePerNode[targetNodeIndex];
});
}
}
void markNode(unsigned nodeIndex)
{
m_nodePartition.markElementInCurrentGeneration(nodeIndex);
}
void refinePartitions()
{
m_nodePartition.refineGeneration([&](unsigned smallestSetIndex) {
m_nodePartition.iterateSet(smallestSetIndex, [&](unsigned nodeIndex) {
unsigned incomingTransitionsStartForNode = m_flattenedTransitionsStartOffsetPerNode[nodeIndex];
unsigned incomingTransitionsSizeForNode = m_flattenedTransitionsSizePerNode[nodeIndex];
for (unsigned i = 0; i < incomingTransitionsSizeForNode; ++i)
m_transitionPartition.markElementInCurrentGeneration(incomingTransitionsStartForNode + i);
});
// We only need to split the transitions, we handle the new sets through the main loop.
m_transitionPartition.refineGeneration([](unsigned) { });
});
}
void splitByUniqueTransitions()
{
for (; m_nextTransitionSetToProcess < m_transitionPartition.size(); ++m_nextTransitionSetToProcess) {
// We use the known splitters to refine the set.
m_transitionPartition.iterateSet(m_nextTransitionSetToProcess, [&](unsigned transitionIndex) {
unsigned sourceNodeIndex = m_flattenedTransitions[transitionIndex].source;
m_nodePartition.markElementInCurrentGeneration(sourceNodeIndex);
});
refinePartitions();
}
}
unsigned nodeReplacement(unsigned nodeIndex)
{
unsigned setIndex = m_nodePartition.setIndex(nodeIndex);
return m_nodePartition.firstElementInSet(setIndex);
}
private:
struct Transition {
unsigned source;
};
struct CounterConverter {
uint32_t convert(uint32_t)
{
return 1;
}
void extend(uint32_t& destination, uint32_t)
{
++destination;
}
};
Vector<unsigned, 0, ContentExtensionsOverflowHandler> m_flattenedTransitionsStartOffsetPerNode;
Vector<unsigned, 0, ContentExtensionsOverflowHandler> m_flattenedTransitionsSizePerNode;
Vector<Transition, 0, ContentExtensionsOverflowHandler> m_flattenedTransitions;
Partition m_nodePartition;
Partition m_transitionPartition;
unsigned m_nextTransitionSetToProcess { 0 };
};
struct ActionKey {
enum DeletedValueTag { DeletedValue };
explicit ActionKey(DeletedValueTag) { state = Deleted; }
enum EmptyValueTag { EmptyValue };
explicit ActionKey(EmptyValueTag) { state = Empty; }
explicit ActionKey(const DFA* dfa, uint32_t actionsStart, uint16_t actionsLength)
: dfa(dfa)
, actionsStart(actionsStart)
, actionsLength(actionsLength)
, state(Valid)
{
StringHasher hasher;
hasher.addCharactersAssumingAligned(reinterpret_cast<const UChar*>(&dfa->actions[actionsStart]), actionsLength * sizeof(uint64_t) / sizeof(UChar));
hash = hasher.hash();
}
bool isEmptyValue() const { return state == Empty; }
bool isDeletedValue() const { return state == Deleted; }
unsigned hash { 0 };
const DFA* dfa { nullptr };
uint32_t actionsStart { 0 };
uint16_t actionsLength { 0 };
enum {
Empty,
Valid,
Deleted
} state { Empty };
};
struct ActionKeyHash {
static unsigned hash(const ActionKey& actionKey)
{
return actionKey.hash;
}
static bool equal(const ActionKey& a, const ActionKey& b)
{
if (a.state != b.state
|| a.dfa != b.dfa
|| a.actionsLength != b.actionsLength)
return false;
for (uint16_t i = 0; i < a.actionsLength; ++i) {
if (a.dfa->actions[a.actionsStart + i] != a.dfa->actions[b.actionsStart + i])
return false;
}
return true;
}
static const bool safeToCompareToEmptyOrDeleted = false;
};
struct ActionKeyHashTraits : public WTF::CustomHashTraits<ActionKey> {
static const bool emptyValueIsZero = true;
};
} // anonymous namespace.
void DFAMinimizer::minimize(DFA& dfa)
{
FullGraphPartition fullGraphPartition(dfa);
// Unlike traditional minimization final states can be differentiated by their action.
// Instead of creating a single set for the final state, we partition by actions from
// the start.
HashMap<ActionKey, Vector<unsigned>, ActionKeyHash, ActionKeyHashTraits> finalStates;
for (unsigned i = 0; i < dfa.nodes.size(); ++i) {
const DFANode& node = dfa.nodes[i];
if (node.hasActions()) {
// FIXME: Sort the actions in the dfa to make nodes that have the same actions in different order equal.
auto addResult = finalStates.add(ActionKey(&dfa, node.actionsStart(), node.actionsLength()), Vector<unsigned>());
addResult.iterator->value.append(i);
}
}
for (const auto& slot : finalStates) {
for (unsigned finalStateIndex : slot.value)
fullGraphPartition.markNode(finalStateIndex);
fullGraphPartition.refinePartitions();
}
// Use every splitter to refine the node partitions.
fullGraphPartition.splitByUniqueTransitions();
Vector<unsigned> relocationVector;
relocationVector.reserveInitialCapacity(dfa.nodes.size());
for (unsigned i = 0; i < dfa.nodes.size(); ++i)
relocationVector.uncheckedAppend(i);
// Update all the transitions.
for (unsigned i = 0; i < dfa.nodes.size(); ++i) {
unsigned replacement = fullGraphPartition.nodeReplacement(i);
if (i != replacement) {
relocationVector[i] = replacement;
dfa.nodes[i].kill(dfa);
}
}
dfa.root = relocationVector[dfa.root];
for (DFANode& node : dfa.nodes) {
if (node.isKilled())
continue;
for (auto& transition : node.transitions(dfa)) {
uint32_t target = transition.target();
uint32_t relocatedTarget = relocationVector[target];
if (target != relocatedTarget)
transition.resetTarget(relocatedTarget);
}
}
}
} // namespace ContentExtensions
} // namespace WebCore
#endif // ENABLE(CONTENT_EXTENSIONS)