2017-07-17 23:09:42 +00:00
|
|
|
#!/usr/bin/env python
|
|
|
|
# Copyright 2014 The Chromium Authors. All rights reserved.
|
|
|
|
# Use of this source code is governed by a BSD-style license that can be
|
|
|
|
# found in the LICENSE file.
|
|
|
|
|
|
|
|
"""
|
|
|
|
A Deterministic acyclic finite state automaton (DAFSA) is a compact
|
|
|
|
representation of an unordered word list (dictionary).
|
|
|
|
|
|
|
|
http://en.wikipedia.org/wiki/Deterministic_acyclic_finite_state_automaton
|
|
|
|
|
|
|
|
This python program converts a list of strings to a byte array in C++.
|
|
|
|
This python program fetches strings and return values from a gperf file
|
|
|
|
and generates a C++ file with a byte array representing a graph that can be
|
|
|
|
used as a memory efficient replacement for the perfect hash table.
|
|
|
|
|
|
|
|
The input strings are assumed to consist of printable 7-bit ASCII characters
|
|
|
|
and the return values are assumed to be one digit integers.
|
|
|
|
|
|
|
|
In this program a DAFSA is a diamond shaped graph starting at a common
|
2020-07-13 19:03:04 +00:00
|
|
|
root node and ending at a common end node. All internal nodes contain
|
|
|
|
a character and each word is represented by the characters in one path from
|
|
|
|
the root node to the end node.
|
2017-07-17 23:09:42 +00:00
|
|
|
|
|
|
|
The order of the operations is crucial since lookups will be performed
|
|
|
|
starting from the source with no backtracking. Thus a node must have at
|
|
|
|
most one child with a label starting with the same character. The output
|
|
|
|
is also arranged so that all jumps are to increasing addresses, thus forward
|
|
|
|
in memory.
|
|
|
|
|
|
|
|
The generated output has suffix free decoding so that the sign of leading
|
|
|
|
bits in a link (a reference to a child node) indicate if it has a size of one,
|
|
|
|
two or three bytes and if it is the last outgoing link from the actual node.
|
|
|
|
A node label is terminated by a byte with the leading bit set.
|
|
|
|
|
|
|
|
The generated byte array can be described by the following BNF:
|
|
|
|
|
|
|
|
<byte> ::= < 8-bit value in range [0x00-0xFF] >
|
|
|
|
|
|
|
|
<char> ::= < printable 7-bit ASCII character, byte in range [0x20-0x7F] >
|
|
|
|
<end_char> ::= < char + 0x80, byte in range [0xA0-0xFF] >
|
|
|
|
<return value> ::= < value + 0x80, byte in range [0x80-0x8F] >
|
|
|
|
|
|
|
|
<offset1> ::= < byte in range [0x00-0x3F] >
|
|
|
|
<offset2> ::= < byte in range [0x40-0x5F] >
|
|
|
|
<offset3> ::= < byte in range [0x60-0x7F] >
|
|
|
|
|
|
|
|
<end_offset1> ::= < byte in range [0x80-0xBF] >
|
|
|
|
<end_offset2> ::= < byte in range [0xC0-0xDF] >
|
|
|
|
<end_offset3> ::= < byte in range [0xE0-0xFF] >
|
|
|
|
|
|
|
|
<prefix> ::= <char>
|
|
|
|
|
|
|
|
<label> ::= <end_char>
|
|
|
|
| <char> <label>
|
|
|
|
|
|
|
|
<end_label> ::= <return_value>
|
|
|
|
| <char> <end_label>
|
|
|
|
|
|
|
|
<offset> ::= <offset1>
|
|
|
|
| <offset2> <byte>
|
|
|
|
| <offset3> <byte> <byte>
|
|
|
|
|
|
|
|
<end_offset> ::= <end_offset1>
|
|
|
|
| <end_offset2> <byte>
|
|
|
|
| <end_offset3> <byte> <byte>
|
|
|
|
|
|
|
|
<offsets> ::= <end_offset>
|
|
|
|
| <offset> <offsets>
|
|
|
|
|
|
|
|
<source> ::= <offsets>
|
|
|
|
|
|
|
|
<node> ::= <label> <offsets>
|
|
|
|
| <prefix> <node>
|
|
|
|
| <end_label>
|
|
|
|
|
|
|
|
<dafsa> ::= <source>
|
|
|
|
| <dafsa> <node>
|
|
|
|
|
|
|
|
Decoding:
|
|
|
|
|
|
|
|
<char> -> printable 7-bit ASCII character
|
|
|
|
<end_char> & 0x7F -> printable 7-bit ASCII character
|
|
|
|
<return value> & 0x0F -> integer
|
|
|
|
<offset1 & 0x3F> -> integer
|
|
|
|
((<offset2> & 0x1F) << 8) + <byte> -> integer
|
|
|
|
((<offset3> & 0x1F) << 16) + (<byte> << 8) + <byte> -> integer
|
|
|
|
|
|
|
|
end_offset1, end_offset2 and end_offset3 are decoded the same as offset1,
|
|
|
|
offset2 and offset3 respectively.
|
|
|
|
|
|
|
|
The first offset in a list of offsets is the distance in bytes between the
|
|
|
|
offset itself and the first child node. Subsequent offsets are the distance
|
|
|
|
between previous child node and next child node. Thus each offset links a node
|
|
|
|
to a child node. The distance is always counted between start addresses, i.e.
|
|
|
|
first byte in decoded offset or first byte in child node.
|
|
|
|
|
|
|
|
Example 1:
|
|
|
|
|
|
|
|
%%
|
|
|
|
aa, 1
|
|
|
|
a, 2
|
|
|
|
%%
|
|
|
|
|
|
|
|
The input is first parsed to a list of words:
|
|
|
|
["aa1", "a2"]
|
|
|
|
|
2020-07-13 19:03:04 +00:00
|
|
|
This produces the following graph:
|
|
|
|
[root] --- a --- 0x02 --- [end]
|
|
|
|
| /
|
|
|
|
| /
|
|
|
|
- a --- 0x01
|
2017-07-17 23:09:42 +00:00
|
|
|
|
|
|
|
A C++ representation of the compressed graph is generated:
|
|
|
|
|
|
|
|
const unsigned char dafsa[7] = {
|
|
|
|
0x81, 0xE1, 0x02, 0x81, 0x82, 0x61, 0x81,
|
|
|
|
};
|
|
|
|
|
|
|
|
The bytes in the generated array have the following meaning:
|
|
|
|
|
|
|
|
0: 0x81 <end_offset1> child at position 0 + (0x81 & 0x3F) -> jump to 1
|
|
|
|
|
|
|
|
1: 0xE1 <end_char> label character (0xE1 & 0x7F) -> match "a"
|
|
|
|
2: 0x02 <offset1> child at position 2 + (0x02 & 0x3F) -> jump to 4
|
|
|
|
|
|
|
|
3: 0x81 <end_offset1> child at position 4 + (0x81 & 0x3F) -> jump to 5
|
|
|
|
4: 0x82 <return_value> 0x82 & 0x0F -> return 2
|
|
|
|
|
|
|
|
5: 0x61 <char> label character 0x61 -> match "a"
|
|
|
|
6: 0x81 <return_value> 0x81 & 0x0F -> return 1
|
|
|
|
|
|
|
|
Example 2:
|
|
|
|
|
|
|
|
%%
|
|
|
|
aa, 1
|
|
|
|
bbb, 2
|
|
|
|
baa, 1
|
|
|
|
%%
|
|
|
|
|
|
|
|
The input is first parsed to a list of words:
|
|
|
|
["aa1", "bbb2", "baa1"]
|
|
|
|
|
2020-07-13 19:03:04 +00:00
|
|
|
This produces the following graph:
|
|
|
|
[root] --- a --- a --- 0x01 --- [end]
|
|
|
|
| / / /
|
|
|
|
| / / /
|
|
|
|
- b --- b --- b --- 0x02
|
2017-07-17 23:09:42 +00:00
|
|
|
|
|
|
|
A C++ representation of the compressed graph is generated:
|
|
|
|
|
|
|
|
const unsigned char dafsa[11] = {
|
|
|
|
0x02, 0x83, 0xE2, 0x02, 0x83, 0x61, 0x61, 0x81, 0x62, 0x62, 0x82,
|
|
|
|
};
|
|
|
|
|
|
|
|
The bytes in the generated array have the following meaning:
|
|
|
|
|
|
|
|
0: 0x02 <offset1> child at position 0 + (0x02 & 0x3F) -> jump to 2
|
|
|
|
1: 0x83 <end_offset1> child at position 2 + (0x83 & 0x3F) -> jump to 5
|
|
|
|
|
|
|
|
2: 0xE2 <end_char> label character (0xE2 & 0x7F) -> match "b"
|
|
|
|
3: 0x02 <offset1> child at position 3 + (0x02 & 0x3F) -> jump to 5
|
|
|
|
4: 0x83 <end_offset1> child at position 5 + (0x83 & 0x3F) -> jump to 8
|
|
|
|
|
|
|
|
5: 0x61 <char> label character 0x61 -> match "a"
|
|
|
|
6: 0x61 <char> label character 0x61 -> match "a"
|
|
|
|
7: 0x81 <return_value> 0x81 & 0x0F -> return 1
|
|
|
|
|
|
|
|
8: 0x62 <char> label character 0x62 -> match "b"
|
|
|
|
9: 0x62 <char> label character 0x62 -> match "b"
|
|
|
|
10: 0x82 <return_value> 0x82 & 0x0F -> return 2
|
|
|
|
"""
|
|
|
|
import sys
|
2019-07-02 12:28:48 +00:00
|
|
|
import struct
|
2017-07-17 23:09:42 +00:00
|
|
|
|
2020-07-13 19:03:04 +00:00
|
|
|
from incremental_dafsa import Dafsa, Node
|
2020-07-10 21:08:10 +00:00
|
|
|
|
2018-05-26 04:29:52 +00:00
|
|
|
|
2017-07-17 23:09:42 +00:00
|
|
|
class InputError(Exception):
    """Raised when a line of the gperf input does not have the expected form.

    The message carries the offending line or value so the problem can be
    located in the input file.
    """
|
2017-07-17 23:09:42 +00:00
|
|
|
|
|
|
|
|
2020-07-13 19:03:04 +00:00
|
|
|
def top_sort(dafsa: Dafsa):
    """Returns the non-end nodes of the graph in topological sort order.

    A node is emitted only after every link pointing at it has been followed,
    so each node appears after all of its parents. End nodes are skipped and
    never appear in the result.
    """
    # Maps id(node) -> number of incoming links that have not been followed.
    pending_refs = {}

    def count_incoming(node: Node):
        """Records one more reference to node, descending on the first visit."""
        if node.is_end_node:
            return
        key = id(node)
        if key in pending_refs:
            pending_refs[key] += 1
            return
        pending_refs[key] = 1
        for successor in node.children.values():
            count_incoming(successor)

    for root_child in dafsa.root_node.children.values():
        count_incoming(root_child)

    # The link coming from the root itself is not a pending reference.
    for root_child in dafsa.root_node.children.values():
        pending_refs[id(root_child)] -= 1

    ready = []
    for root_child in dafsa.root_node.children.values():
        if pending_refs[id(root_child)] == 0:
            ready.append(root_child)

    ordered = []
    while ready:
        current = ready.pop()
        assert pending_refs[id(current)] == 0
        ordered.append(current)
        for successor in current.children.values():
            if successor.is_end_node:
                continue
            pending_refs[id(successor)] -= 1
            if pending_refs[id(successor)] == 0:
                ready.append(successor)
    return ordered
|
2017-07-17 23:09:42 +00:00
|
|
|
|
|
|
|
|
2020-07-13 19:03:04 +00:00
|
|
|
def encode_links(node: Node, offsets, current):
    """Encodes a list of children as one, two or three byte offsets.

    Args:
        node: The node whose outgoing links are encoded. It must have at
            least one child.
        offsets: Maps id(child) to the offset recorded for that child. Every
            child of a node that is not an <end_label> node needs an entry.
        current: Length of the output produced so far, i.e. the position at
            which the returned bytes are going to be appended.

    Returns:
        A list of byte values. It is empty for an <end_label> node. The list
        is reversed before it is returned, matching the caller, which builds
        the whole output back to front and reverses it at the end.
    """
    # Check the precondition before looking at the first child; otherwise a
    # node without children fails with an unrelated StopIteration.
    assert node.children

    if next(iter(node.children.values())).is_end_node:
        # This is an <end_label> node and no links follow such nodes
        return []

    # Children with the largest recorded offset are handled first.
    children = sorted(node.children.values(), key=lambda x: -offsets[id(x)])

    # The distances depend on how many bytes the links themselves occupy, so
    # start from the worst case (three bytes per link) and repeat until the
    # encoded size matches the size that was assumed.
    guess = 3 * len(node.children)
    while True:
        offset = current + guess
        buf = []
        for child in children:
            # Index of the first byte of the link that is about to be written.
            last = len(buf)
            distance = offset - offsets[id(child)]
            assert 0 < distance < (1 << 21)

            if distance < (1 << 6):
                # A 6-bit offset: "s0xxxxxx"
                buf.append(distance)
            elif distance < (1 << 13):
                # A 13-bit offset: "s10xxxxxxxxxxxxx"
                buf.append(0x40 | (distance >> 8))
                buf.append(distance & 0xFF)
            else:
                # A 21-bit offset: "s11xxxxxxxxxxxxxxxxxxxxx"
                buf.append(0x60 | (distance >> 16))
                buf.append((distance >> 8) & 0xFF)
                buf.append(distance & 0xFF)
            # Distance in first link is relative to following record.
            # Distance in other links are relative to previous link.
            offset -= distance
        if len(buf) == guess:
            break
        guess = len(buf)
    # Set most significant bit to mark end of links in this node.
    buf[last] |= 1 << 7
    buf.reverse()
    return buf
|
2017-07-17 23:09:42 +00:00
|
|
|
|
|
|
|
|
|
|
|
def encode_prefix(label):
    """Encodes a node label as a list of bytes without a trailing high byte.

    Used for a node that has exactly one child which follows immediately
    after it, so that no jump is needed. The label then acts as a prefix to
    the label of the child node. The character codes are returned with the
    last character of the label first.
    """
    assert label
    codes = list(map(ord, label))
    codes.reverse()
    return codes
|
2017-07-17 23:09:42 +00:00
|
|
|
|
|
|
|
|
|
|
|
def encode_label(label):
    """Encodes a node label as a list of bytes with a trailing high byte >0x80.

    The bytes come from encode_prefix(), so they are in reverse order and the
    final character of the label sits at index 0.
    """
    encoded = encode_prefix(label)
    # Set most significant bit to mark end of label in this node.
    encoded[0] = encoded[0] | 0x80
    return encoded
|
2017-07-17 23:09:42 +00:00
|
|
|
|
|
|
|
|
2020-07-13 19:03:04 +00:00
|
|
|
def encode(dafsa: Dafsa):
    """Encodes a DAFSA to a list of bytes.

    Nodes are written in reverse topological order into a buffer that is
    assembled back to front; the buffer is reversed once at the end, after the
    links of the root node have been appended.
    """
    stream = []
    # Maps id(node) -> length of the stream right after the node was written.
    offsets = {}

    for node in reversed(top_sort(dafsa)):
        is_prefix = False
        if len(node.children) == 1:
            only_child = next(iter(node.children.values()))
            # A single non-end child written immediately before this node
            # needs no link; the label is emitted as a prefix of the child.
            is_prefix = (
                not only_child.is_end_node
                and offsets[id(only_child)] == len(stream)
            )

        if is_prefix:
            stream += encode_prefix(node.character)
        else:
            stream += encode_links(node, offsets, len(stream))
            stream += encode_label(node.character)
        offsets[id(node)] = len(stream)

    stream += encode_links(dafsa.root_node, offsets, len(stream))
    stream.reverse()
    return stream
|
2017-07-17 23:09:42 +00:00
|
|
|
|
2019-07-02 13:04:46 +00:00
|
|
|
|
2017-08-11 21:12:03 +00:00
|
|
|
def to_cxx(data, preamble=None):
    """Generates C++ code from a list of encoded bytes.

    Args:
        data: Sequence of integers, each formatted as a two-digit hex literal
            in the generated array.
        preamble: Optional text copied verbatim between the header comment
            and the array definition. Nothing is inserted when it is empty
            or None.

    Returns:
        The generated C++ source as a single string.
    """
    # Collect the pieces and join once instead of growing a string with "+=".
    parts = [
        "/* This file is generated. DO NOT EDIT!\n\n",
        "The byte array encodes a dictionary of strings and values. See ",
        "make_dafsa.py for documentation.",
        "*/\n\n",
    ]

    if preamble:
        parts.append(preamble)
        parts.append("\n\n")

    parts.append("const unsigned char kDafsa[%s] = {\n" % len(data))
    # Twelve bytes per row; every row ends with a trailing comma.
    for i in range(0, len(data), 12):
        row = ", ".join("0x%02x" % byte for byte in data[i : i + 12])
        parts.append(" " + row + ",\n")
    parts.append("};\n")
    return "".join(parts)
|
2017-07-17 23:09:42 +00:00
|
|
|
|
|
|
|
|
2017-08-11 21:12:03 +00:00
|
|
|
def words_to_cxx(words, preamble=None):
    """Generates C++ code from a word list.

    The words are turned into a DAFSA, the DAFSA is encoded to bytes, and the
    bytes are rendered by to_cxx() together with the optional preamble.
    """
    encoded = encode(Dafsa.from_tld_data(words))
    return to_cxx(encoded, preamble)
|
2017-07-17 23:09:42 +00:00
|
|
|
|
|
|
|
|
2019-07-02 12:28:48 +00:00
|
|
|
def words_to_bin(words):
    """Generates bytes from a word list.

    The words are turned into a DAFSA and the encoded byte values are packed
    into a bytes object, one unsigned byte per value.
    """
    encoded = encode(Dafsa.from_tld_data(words))
    packer = struct.Struct("%dB" % len(encoded))
    return packer.pack(*encoded)
|
|
|
|
|
|
|
|
|
2017-07-17 23:09:42 +00:00
|
|
|
def parse_gperf(infile):
    """Parses a gperf file and extracts the preamble, strings and return codes.

    Args:
        infile: Iterable of text lines, e.g. an open file.

    Returns:
        A (preamble, words) tuple. The preamble is the text before the first
        "%%" line, with each line stripped. Each word is the string from an
        entry line with its one-digit return value appended.

    Raises:
        InputError: An entry line does not end in ", <digit>", or the digit
            is not one of 0, 1, 2 or 4.
        ValueError: One of the two "%%" delimiter lines is missing.
    """
    stripped = [raw.strip() for raw in infile]

    # Extract the preamble.
    preamble_end = stripped.index("%%")
    preamble = "\n".join(stripped[:preamble_end])

    # Extract strings after the first '%%' and before the second '%%'.
    body_start = preamble_end + 1
    body_end = stripped.index("%%", body_start)

    words = []
    for entry in stripped[body_start:body_end]:
        if entry[-3:-1] != ", ":
            raise InputError('Expected "domainname, <digit>", found "%s"' % entry)
        value = entry[-1]
        # Technically the DAFSA format could support return values in range [0-31],
        # but the values below are the only with a defined meaning.
        if value not in "0124":
            raise InputError(
                'Expected value to be one of {0,1,2,4}, found "%s"' % value
            )
        words.append(entry[:-3] + value)
    return (preamble, words)
|
2017-07-17 23:09:42 +00:00
|
|
|
|
|
|
|
|
|
|
|
def main(outfile, infile):
    """Converts a gperf file to C++ code and writes it to outfile.

    Args:
        outfile: Writable text stream that receives the generated code.
        infile: Path of the gperf input file.

    Returns:
        0, which the caller passes on as the process exit status.
    """
    with open(infile, "r") as source:
        preamble, words = parse_gperf(source)
    generated = words_to_cxx(words, preamble)
    outfile.write(generated)
    return 0
|
2017-07-17 23:09:42 +00:00
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Exactly two arguments are expected: the gperf input path followed by the
    # path of the file to generate.
    if len(sys.argv) != 3:
        print("usage: %s infile outfile" % sys.argv[0])
        sys.exit(1)

    # The output file is opened (and truncated) before the input is read.
    with open(sys.argv[2], "w") as outfile:
        status = main(outfile, sys.argv[1])
    sys.exit(status)
|