Andrew Gallant 31a317eadd Major literal optimization refactoring.
The principal change in this commit is a complete rewrite of how
literals are detected from a regular expression. In particular, we now
traverse the abstract syntax to discover literals instead of the
compiled byte code. This permits more tunable control over which and
how many literals are extracted, and is now exposed in the
`regex-syntax` crate so that others can benefit from it.
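
(As a rough illustration of what traversing the abstract syntax means here,
the toy sketch below walks a hand-rolled AST of literal, alternation and
concatenation nodes to collect required prefix literals. It is not the
`regex-syntax` API; the node encoding and function name are invented for
the example.)

    def prefix_literals(ast):
        """Collect the literal prefixes every match must start with, or
        None if no useful prefix exists (class, repetition, anchor, ...)."""
        kind = ast[0]
        if kind == 'lit':
            return {ast[1]}
        if kind == 'alt':
            parts = [prefix_literals(a) for a in ast[1]]
            if any(p is None for p in parts):
                return None
            return set().union(*parts)
        if kind == 'concat':
            # In this simplified model, only the first element of a
            # concatenation contributes to the prefix.
            return prefix_literals(ast[1][0]) if ast[1] else None
        return None

    # (foo|bar)baz  ->  {'foo', 'bar'}
    ast = ('concat', [('alt', [('lit', 'foo'), ('lit', 'bar')]), ('lit', 'baz')])
    print(prefix_literals(ast))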

Other changes in this commit:

* The Boyer-Moore algorithm was rewritten to use my own concoction based
  on frequency analysis. We end up regressing slightly on a couple of
  benchmarks because of this, but gain on some others and should in
  general be faster across a broader range of inputs. (Principally
  because we try to run `memchr` on the rarest byte in a literal; see
  the sketch after this list.) This should also greatly improve handling
  of non-Western text.
* A "reverse suffix" literal optimization was added. That is, if suffix
  literals exist but no prefix literals exist, then we can quickly scan
  for suffix matches and then run the DFA in reverse to find matches.
  (I'm not aware of any other regex engine that does this.)
* The mutex-based pool has been replaced with a spinlock-based pool
  (from the new `mempool` crate). This reduces some amount of constant
  overhead and improves several benchmarks that either search short
  haystacks or find many matches in long haystacks.
* Search parameters have been refactored.
* RegexSet can now contain 0 or more regular expressions (previously, it
  could only contain 2 or more). The InvalidSet error variant is now
  deprecated.
* A bug in computing start states was fixed. Namely, the DFA assumed the
  start state was always the first instruction, which is trivially
  wrong for an expression like `^☃$`. This bug persisted because it
  typically occurred when a literal optimization would otherwise run.
* A new CLI tool, regex-debug, has been added as a non-published
  sub-crate. The CLI tool can report various facts about a regular
  expression, such as its AST, its compiled byte code or its detected
  literals.
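
As a rough sketch of the rarest-byte idea from the first bullet above (not
the crate's actual code), assume a 256-entry rank table like the one
generated by scripts/frequencies.py below, where a higher value means a
more common byte:

    def rarest_byte_offset(literal, byte_frequencies):
        """Pick the offset of the least common byte in a literal. A
        memchr-style scan can then look for that byte and only verify
        the full literal around each hit."""
        return min(range(len(literal)),
                   key=lambda i: byte_frequencies[literal[i]])

    # Hypothetical toy table: a handful of common ASCII letters rank high,
    # everything else ranks 0 (rare).
    freqs = [0] * 256
    for b in b'etaoinshrdlu ':
        freqs[b] = 200

    needle = b'sherlock'
    i = rarest_byte_offset(needle, freqs)
    print(i, chr(needle[i]))  # 6 c -- 'c' is rarer than the letters around it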
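
Similarly, a simplified sketch of the reverse suffix idea from the second
bullet (a real engine runs a reverse DFA and resolves overlapping
candidates with proper leftmost-match semantics; here a hand-reversed
pattern and Python's re module stand in for it):

    import re

    def reverse_suffix_find(haystack, suffix, rev_pattern):
        """Locate candidate match ends via the suffix literal, then confirm
        each candidate by matching a reversed pattern against the reversed
        prefix of the haystack. Candidates may overlap in this toy version."""
        spans = []
        end = haystack.find(suffix)
        while end != -1:
            stop = end + len(suffix)
            m = re.match(rev_pattern, haystack[:stop][::-1])
            if m:
                spans.append((stop - m.end(), stop))
            end = haystack.find(suffix, end + 1)
        return spans

    # For the pattern `[a-z]+ing`, the hand-reversed pattern is `gni[a-z]+`.
    print(reverse_suffix_find("singing and running", "ing", r"gni[a-z]+"))
    # [(0, 4), (0, 7), (12, 19)]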

Closes #96, #188, #189
2016-03-27 20:07:46 -04:00

scripts/frequencies.py (83 lines, Python, executable file)

#!/usr/bin/env python
# This does simple normalized frequency analysis on UTF-8 encoded text. The
# result of the analysis is translated to a ranked list, where every byte is
# assigned a rank. This list is written to src/freqs.rs.
#
# Currently, the frequencies are generated from the following corpora:
#
# * The CIA World Factbook
# * The source code of rustc
# * Septuaginta
from __future__ import absolute_import, division, print_function
import argparse
from collections import Counter
import sys
preamble = '''// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
// NOTE: The following code was generated by "scripts/frequencies.py", do not
// edit directly
'''
def eprint(*args, **kwargs):
    kwargs['file'] = sys.stderr
    print(*args, **kwargs)


def main():
    p = argparse.ArgumentParser()
    p.add_argument('corpus', metavar='FILE', nargs='+')
    args = p.parse_args()

    # Get frequency counts of each byte.
    freqs = Counter()
    for i in range(0, 256):
        freqs[i] = 0

    eprint('reading entire corpus into memory')
    corpus = []
    for fpath in args.corpus:
        corpus.append(open(fpath, 'rb').read())

    eprint('computing byte frequencies')
    for c in corpus:
        for byte in c:
            # Normalize by corpus length so that each corpus contributes
            # equally to the totals, regardless of its size.
            freqs[byte] += 1.0 / float(len(c))

    eprint('writing Rust code')
    # Get the rank of each byte. A lower rank => lower relative frequency.
    rank = [0] * 256
    for i, (byte, _) in enumerate(freqs.most_common()):
        rank[byte] = 255 - i
    # Forcefully set the highest rank possible for bytes that start multi-byte
    # UTF-8 sequences. The idea here is that a continuation byte will be more
    # discerning in a homogeneous haystack.
    for byte in range(0xC0, 0xFF + 1):
        rank[byte] = 255

    # Now write Rust.
    olines = ['pub const BYTE_FREQUENCIES: [u8; 256] = [']
    for byte in range(256):
        olines.append(' %3d, // %r' % (rank[byte], chr(byte)))
    olines.append('];')
    print(preamble)
    print('\n'.join(olines))


if __name__ == '__main__':
    main()