Bug 1083971 - Add an option to output a binary file for the PSL data r=leplatrem,erahm

Differential Revision: https://phabricator.services.mozilla.com/D34364

--HG--
extra : moz-landing-system : lando
This commit is contained in:
Arpit Bharti 2019-07-02 12:28:48 +00:00
parent e8c093c291
commit 3ac5afd32b
2 changed files with 43 additions and 10 deletions

View File

@ -8,7 +8,7 @@ import imp
import os
import re
import sys
from make_dafsa import words_to_cxx
from make_dafsa import words_to_cxx, words_to_bin
"""
Processes a file containing effective TLD data. See the following URL for a
@ -98,11 +98,12 @@ class EffectiveTLDEntry:
# DO EVERYTHING #
#################
def main(output, effective_tld_filename):
def main(output, effective_tld_filename, output_format="cxx"):
"""
effective_tld_filename is the effective TLD file to parse.
A C++ array of a binary representation of a DAFSA representing the
eTLD file is then printed to output.
Based on the output format, either a C++ array of a binary representation
of a DAFSA representing the eTLD file is written to output as text, or the
binary representation itself is written to output as raw bytes.
"""
def typeEnum(etld):
@ -123,7 +124,26 @@ def main(output, effective_tld_filename):
for etld in getEffectiveTLDs(effective_tld_filename):
yield "%s%d" % (etld.domain(), typeEnum(etld))
output.write(words_to_cxx(dafsa_words()))
""" words_to_bin() returns a bytes while words_to_cxx() returns string """
if output_format == "bin":
if sys.version_info[0] >= 3:
output = output.buffer
output.write(words_to_bin(dafsa_words()))
else:
output.write(words_to_cxx(dafsa_words()))
if __name__ == '__main__':
    # This program can output the DAFSA in two formats:
    # as C++ code that will be included and compiled at build time
    # or as a binary file that will be published in Remote Settings.
    #
    # Flags for format options:
    # "cxx" -> C++ array [default]
    # "bin" -> Binary file
    cli_args = sys.argv[1:]
    output_format = "bin" if "--bin" in cli_args else "cxx"

    # The flag may be given before or after the input file, so the effective
    # TLD filename is the first argument that is not the flag itself.
    positional = [arg for arg in cli_args if arg != "--bin"]
    if not positional:
        sys.exit("usage: %s [--bin] <effective_tld_filename>" % sys.argv[0])

    # Run the conversion exactly once, with the selected format.
    main(sys.stdout, positional[0], output_format=output_format)

View File

@ -193,6 +193,7 @@ The bytes in the generated array has the following meaning:
"""
import sys
import struct
class InputError(Exception):
@ -382,7 +383,7 @@ def encode_prefix(label):
"""Encodes a node label as a list of bytes without a trailing high byte.
This method encodes a node if there is exactly one child and the
child follows immidiately after so that no jump is needed. This label
child follows immediately after so that no jump is needed. This label
will then be a prefix to the label in the child node.
"""
assert label
@ -416,6 +417,13 @@ def encode(dafsa):
output.reverse()
return output
def encode_words(words):
    """Generates a dafsa representation of a word list.

    The word list is converted with to_dafsa() and the result is then run
    through the reverse, join_suffixes, reverse, join_suffixes and
    join_labels passes, in that order. The output of the last pass is
    returned.
    """
    graph = to_dafsa(words)
    passes = (reverse, join_suffixes, reverse, join_suffixes, join_labels)
    for transform in passes:
        # Each pass takes the current graph and returns the next one.
        graph = transform(graph)
    return graph
def to_cxx(data, preamble=None):
"""Generates C++ code from a list of encoded bytes."""
@ -439,12 +447,17 @@ def to_cxx(data, preamble=None):
def words_to_cxx(words, preamble=None):
    """Generates C++ code from a word list.

    The words are converted to a DAFSA by encode_words(), the DAFSA is
    encoded with encode(), and the encoded data is rendered by to_cxx().
    preamble is passed through to to_cxx() unchanged.
    """
    # Build the DAFSA exactly once, through the shared helper: words may be a
    # one-shot iterable (e.g. a generator) that cannot be traversed twice.
    dafsa = encode_words(words)
    return to_cxx(encode(dafsa), preamble)
def words_to_bin(words):
    """Generates bytes from a word list.

    The words are converted with encode_words() and encode(); every item of
    the encoded data is then packed as one unsigned byte (struct format
    'B') and the packed result is returned.
    """
    encoded = encode(encode_words(words))
    # One 'B' slot per encoded item, e.g. '3B' for three items.
    layout = '%dB' % len(encoded)
    return struct.pack(layout, *encoded)
def parse_gperf(infile):
"""Parses gperf file and extract strings and return code"""
lines = [line.strip() for line in infile]