third_party_icu/tools/scripts/icu-file-utf8-check.py

#! /usr/bin/python -B

# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html

# Copyright (C) 2009-2011, International Business Machines Corporation, Google and Others.
# All rights reserved.

#
#  Script to check that ICU source files contain only valid UTF-8 encoded text,
#  and that all files except '.txt', '.sln', '.targets' and '.vcxproj' files
#  do not contain a Byte Order Mark (BOM). Files are also checked for \r
#  line endings.
#
#  THIS SCRIPT DOES NOT WORK ON WINDOWS
#     It only works correctly on platforms where the native line ending is a plain \n
#
#  usage:
#     icu-file-utf8-check.py  [options]
#
#  options:
#     -h | --help    Print a usage line and exit.
#
#  The tool operates recursively on the directory from which it is run.
#  Only files from the ICU github repository are checked.
#  No changes are made to the repository or to the working copy; files are only read.
#  The script checks all files and returns a non-zero exit code if any source
#  file contains a non-UTF-8 character, or if any checked file contains a
#  disallowed BOM or a \r line ending.

from __future__ import print_function

import sys
import os
import os.path
import re
import getopt


# List of directories to check for UTF-8 and BOM. Currently covers
# all of icu/. Modify as needed. Each entry is substituted for DIR in
# the "git ls-files DIR" command that main() runs to obtain the list
# of files to check.
icu_directories_to_be_scanned = ["."]

# Files that are allowed to contain \r line endings. Entries are compared
# verbatim against the file names listed by "git ls-files", so they must
# be spelled exactly as git prints them. If this list
# grows too long consider a file instead.
ignore_cr_in_files = [
    "vendor/double-conversion/upstream/msvc/testrunner.cmd"
    ]

def runCommand(cmd):
    """Run `cmd` through the shell and capture its standard output.

    Returns a (text, status) pair. `text` is everything the command wrote
    to stdout. `status` is the value returned by closing the os.popen()
    pipe: None when the command exited successfully, a non-zero value
    otherwise.
    """
    pipe = os.popen(cmd)
    captured = pipe.read()
    # close() is called explicitly (rather than via `with`) because its
    # return value is the only place the command's exit status is reported.
    status = pipe.close()
    return captured, status


def usage():
    """Print a one-line usage summary for this script to stdout."""
    program = sys.argv[0]
    print("usage: %s [-h | --help]" % (program,))


#
#  File check.         Check source code files for UTF-8 and all except text files for not containing a BOM
#    file_name:        name of a text file.
#    is_source:        Flag, set to True if file is a source code file (.c, .cpp, .h, .java).
#
def check_file(file_name, is_source):
    """Check one file for invalid UTF-8 (source files only) and for a BOM.

    file_name:  path of the file to check; it is read in binary mode.
    is_source:  truthy if the file is a source code file; only then is the
                content required to decode as UTF-8.

    Returns 0 if the file passes, 1 if a problem was found. Each problem is
    also reported on stdout.
    """
    rc = 0
    # Context manager so the handle is released even if read() fails.
    with open(file_name, 'rb') as f:
        data = f.read()

    if is_source:
        try:
            data.decode("UTF-8")
        except UnicodeDecodeError:
            print("Error: %s is a source code file but contains non-utf-8 bytes." % file_name)
            rc = 1

    # Compare against the complete three-byte UTF-8 BOM. startswith() is safe
    # on an empty file (indexing byte 0 is not), behaves the same on Python 2
    # and 3, and does not misreport files whose first character merely
    # happens to be encoded with a leading 0xEF byte.
    if data.startswith(b"\xef\xbb\xbf"):
        # These file types are permitted to carry a BOM.
        bom_allowed = (file_name.endswith((".txt", ".sln", ".targets"))
                       or ".vcxproj" in file_name)
        if not bom_allowed:
            print("Warning: file %s contains a UTF-8 BOM: " % file_name)
            rc = 1

    return rc

def main(argv):
    """Parse the command line, then check every git-tracked file.

    argv:  command-line arguments without the program name.

    For each directory in icu_directories_to_be_scanned, the files listed
    by "git ls-files" are passed to check_file() and then searched for \\r
    characters with grep. The function never returns normally: it exits
    with status 0 if every file passed, 1 if any check failed or the git
    command failed, and 2 on a command-line error.
    """
    exit_status = 0

    try:
        opts, args = getopt.getopt(argv, "h", ["help"])
    except getopt.GetoptError as err:
        # Report getopt's own message; argv[0] is not necessarily the
        # argument that was rejected.
        print(err.msg)
        usage()
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            sys.exit()
    if args:
        print("unexpected command line argument")
        usage()
        sys.exit(2)

    source_file_re = re.compile(r".*((?:\.c$)|(?:\.cpp$)|(?:\.h$)|(?:\.java$))")
    git_cmd = "git ls-files DIR"

    for directory in icu_directories_to_be_scanned:
        print('Scanning ' + directory)
        cmd = git_cmd.replace("DIR", directory)
        output, rc = runCommand(cmd)
        if rc:
            # Without a file list nothing can be checked; stop with a
            # failure status instead of reporting a misleading success.
            print('"', cmd, '" failed. Exiting.', file=sys.stderr)
            sys.exit(1)
        file_list = output.splitlines()

        for f in file_list:
            if os.path.isdir(f):
                print("Skipping dir " + f)
                continue
            if not os.path.isfile(f):
                print("Repository file not in working copy: " + f)
                continue

            source_file = source_file_re.match(f)
            if check_file(f, source_file) != 0:
                exit_status = 1

            # Lastly, check the line endings of the file.
            # grep exits with status 0 when it finds a match, and
            # runCommand() reports a zero exit status as None, so
            # `rc is None` means the file contains a \r.
            # The file name is single-quoted for the shell (embedded single
            # quotes are escaped) and preceded by "--" so that names with
            # spaces, shell metacharacters or a leading "-" are passed to
            # grep unchanged.
            quoted = "'" + f.replace("'", "'\\''") + "'"
            output, rc = runCommand("grep -rPIl -- \"\\r\" " + quoted)
            if rc is None and f not in ignore_cr_in_files:
                print("File ", f, " has \\r line ending")
                exit_status = 1

    print(exit_status)
    sys.exit(exit_status)

# Run the checks only when executed as a script. The program name in
# sys.argv[0] is dropped so that main() receives just the options.
if __name__ == "__main__":
    main(sys.argv[1:])
Signed-off-by: mengjingzhimo <luoboming@huawei.com> Changes to be committed: modified: KEYS modified: README.OpenSource modified: README.md 2022-08-31 12:27:49 +00:00			`#! /usr/bin/python -B`

			`# Copyright (C) 2016 and later: Unicode, Inc. and others.`
			`# License & terms of use: http://www.unicode.org/copyright.html`

			`# Copyright (C) 2009-2011, International Business Machines Corporation, Google and Others.`
			`# All rights reserved.`

			`#`
			`# Script to check that ICU source files contain only valid UTF-8 encoded text,`
			`# and that all files except '.txt' files do not contain a Byte Order Mark (BOM).`
			`#`
			`# THIS SCRIPT DOES NOT WORK ON WINDOWS`
			`# It only works correctly on platforms where the native line ending is a plain \n`
			`#`
			`# usage:`
			`# icu-file-utf8-check.py [options]`
			`#`
			`# options:`
			`# -h \| --help Print a usage line and exit.`
			`#`
			`# The tool operates recursively on the directory from which it is run.`
			`# Only files from the ICU github repository are checked.`
			`# No changes are made to the repository; only the working copy will be altered.`
			`# The script checks all source files and returns a non-zero exit code if any of`
			`# the checked files contain a non-UTF-8 character.`

			`from __future__ import print_function`

			`import sys`
			`import os`
			`import os.path`
			`import re`
			`import getopt`


			`# List of directories to check for UTF-8 and BOM. Currently covers`
			`# all of icu/. Modify as needed.`
			`icu_directories_to_be_scanned = ["."]`

			`# Files that are allowed to contain \r line endings. If this list`
			`# grows too long consider a file instead.`
			`ignore_cr_in_files = [`
			`"vendor/double-conversion/upstream/msvc/testrunner.cmd"`
			`]`

			`def runCommand(cmd):`
			`output_file = os.popen(cmd);`
			`output_text = output_file.read();`
			`exit_status = output_file.close();`

			`return output_text, exit_status`


			`def usage():`
			`print("usage: " + sys.argv[0] + " [-h \| --help]")`


			`#`
			`# File check. Check source code files for UTF-8 and all except text files for not containing a BOM`
			`# file_name: name of a text file.`
			`# is_source: Flag, set to True if file is a source code file (.c, .cpp, .h, .java).`
			`#`
			`def check_file(file_name, is_source):`
			`rc = 0`
			`f = open(file_name, 'rb')`
			`bytes = f.read()`
			`f.close()`

			`if is_source:`
			`try:`
			`bytes.decode("UTF-8")`
			`except UnicodeDecodeError:`
			`print("Error: %s is a source code file but contains non-utf-8 bytes." % file_name)`
			`rc = 1`

			`if bytes[0] == 0xef:`
			`if not (file_name.endswith(".txt") or file_name.endswith(".sln")`
			`or file_name.endswith(".targets") or ".vcxproj" in file_name):`
			`print("Warning: file %s contains a UTF-8 BOM: " % file_name)`
			`rc = 1`

			`return rc`

			`def main(argv):`
			`exit_status = 0`
			`rc = 0`

			`try:`
			`opts, args = getopt.getopt(argv, "h", ("help"))`
			`except getopt.GetoptError:`
			`print("unrecognized option: " + argv[0])`
			`usage()`
			`sys.exit(2)`
			`for opt, arg in opts:`
			`if opt in ("-h", "--help"):`
			`usage()`
			`sys.exit()`
			`if args:`
			`print("unexpected command line argument")`
			`usage()`
			`sys.exit(2)`

			`source_file_re = re.compile(".*((?:\\.c$)\|(?:\\.cpp$)\|(?:\\.h$)\|(?:\\.java$))")`
			`git_cmd = "git ls-files DIR"`

			`for dir in icu_directories_to_be_scanned:`
			`print('Scanning ' + dir)`
			`cmd = git_cmd.replace("DIR", dir)`
			`output, rc = runCommand(cmd)`
			`if rc:`
			`print('"', cmd, '" failed. Exiting.', file=sys.stderr)`
			`file_list = output.splitlines()`

			`for f in file_list:`
			`if os.path.isdir(f):`
			`print("Skipping dir " + f)`
			`continue`
			`if not os.path.isfile(f):`
			`print("Repository file not in working copy: " + f)`
			`continue;`

			`source_file = source_file_re.match(f)`
			`if check_file(f, source_file) != 0:`
			`exit_status = 1`

			`# Lastly, check the line endings of the file.`
			`# Note that 'grep' returns null if it reports a file,`
			`# a non-null value otherwise.`
			`output, rc = runCommand("grep -rPIl \"\\r\" " + f)`
			`if (rc is None):`
			`if f not in ignore_cr_in_files:`
			`print("File ", f, " has \\r line ending")`
			`exit_status = 1`

			`print(exit_status)`
			`sys.exit(exit_status)`

			`if __name__ == "__main__":`
			`main(sys.argv[1:])`