mirror of
https://github.com/openharmony/third_party_gettext.git
synced 2026-07-01 10:25:03 -04:00
xgettext: Assume that Python source files are in UTF-8 by default.
Reported by ilias iliadis <apiuano-bugs@yahoo.gr> at <https://savannah.gnu.org/bugs/?55231>.
* gettext-tools/src/xg-encoding.h (xgettext_global_source_encoding): Allow a NULL value.
* gettext-tools/src/xg-encoding.c (xgettext_global_source_encoding): Likewise.
* gettext-tools/src/xgettext.c (main): Initialize xgettext_global_source_encoding with NULL, not "ASCII". Handle a NULL xgettext_global_source_encoding.
(extract_from_file): Use ASCII as default for xgettext_global_source_encoding.
* gettext-tools/src/x-javascript.c (extract_javascript): Likewise.
* gettext-tools/src/x-python.c (phase2_getc): Signal an error when encountering an invalid or incomplete UTF-8 character.
(set_current_file_source_encoding): In the error message, use xgettext_current_file_source_encoding instead of xgettext_global_source_encoding.
(extract_python): Use UTF-8 as default for xgettext_global_source_encoding.
* gettext-tools/tests/xgettext-python-3: Verify that if the source file has no magic coding comment but is UTF-8 encoded, xgettext succeeds.
* NEWS: Mention the change.
This commit is contained in:
@@ -8,6 +8,9 @@ Version 0.21 - August 2019
|
||||
o xgettext now recognizes 'gettext' program invocations with the '-e'
|
||||
option, such as
|
||||
gettext -e 'some\nstring\n'
|
||||
- Python:
|
||||
xgettext now assumes a Python source file is in UTF-8 encoding by default,
|
||||
as stated in PEP 3120.
|
||||
- Java:
|
||||
xgettext now recognizes format strings in the Formatter syntax. They
|
||||
are marked as 'java-printf-format' in POT and PO files.
|
||||
|
||||
@@ -1717,7 +1717,9 @@ extract_javascript (FILE *f,
|
||||
xml_element_depth = 0;
|
||||
inside_embedded_js_in_xml = false;
|
||||
|
||||
xgettext_current_file_source_encoding = xgettext_global_source_encoding;
|
||||
xgettext_current_file_source_encoding =
|
||||
(xgettext_global_source_encoding != NULL ? xgettext_global_source_encoding :
|
||||
po_charset_ascii);
|
||||
#if HAVE_ICONV
|
||||
xgettext_current_file_source_iconv = xgettext_global_source_iconv;
|
||||
#endif
|
||||
|
||||
+116
-87
@@ -321,13 +321,7 @@ as specified in https://www.python.org/peps/pep-0263.html.\n")));
|
||||
if (errno == EILSEQ)
|
||||
{
|
||||
/* An invalid multibyte sequence was encountered. */
|
||||
multiline_error (xstrdup (""),
|
||||
xasprintf (_("\
|
||||
%s:%d: Invalid multibyte sequence.\n\
|
||||
Please specify the correct source encoding through --from-code or through a\n\
|
||||
comment as specified in https://www.python.org/peps/pep-0263.html.\n"),
|
||||
real_file_name, line_number));
|
||||
exit (EXIT_FAILURE);
|
||||
goto invalid;
|
||||
}
|
||||
else if (errno == EINVAL)
|
||||
{
|
||||
@@ -350,25 +344,9 @@ comment as specified in https://www.python.org/peps/pep-0263.html.\n"),
|
||||
/* Read one more byte and retry iconv. */
|
||||
c = phase1_getc ();
|
||||
if (c == EOF)
|
||||
{
|
||||
multiline_error (xstrdup (""),
|
||||
xasprintf (_("\
|
||||
%s:%d: Incomplete multibyte sequence at end of file.\n\
|
||||
Please specify the correct source encoding through --from-code or through a\n\
|
||||
comment as specified in https://www.python.org/peps/pep-0263.html.\n"),
|
||||
real_file_name, line_number));
|
||||
exit (EXIT_FAILURE);
|
||||
}
|
||||
goto incomplete_at_eof;
|
||||
if (c == '\n')
|
||||
{
|
||||
multiline_error (xstrdup (""),
|
||||
xasprintf (_("\
|
||||
%s:%d: Incomplete multibyte sequence at end of line.\n\
|
||||
Please specify the correct source encoding through --from-code or through a\n\
|
||||
comment as specified in https://www.python.org/peps/pep-0263.html.\n"),
|
||||
real_file_name, line_number - 1));
|
||||
exit (EXIT_FAILURE);
|
||||
}
|
||||
goto incomplete_at_eol;
|
||||
buf[bufcount++] = (unsigned char) c;
|
||||
}
|
||||
else
|
||||
@@ -394,13 +372,7 @@ comment as specified in https://www.python.org/peps/pep-0263.html.\n"),
|
||||
{
|
||||
/* scratchbuf contains an out-of-range Unicode character
|
||||
(> 0x10ffff). */
|
||||
multiline_error (xstrdup (""),
|
||||
xasprintf (_("\
|
||||
%s:%d: Invalid multibyte sequence.\n\
|
||||
Please specify the source encoding through --from-code or through a comment\n\
|
||||
as specified in https://www.python.org/peps/pep-0263.html.\n"),
|
||||
real_file_name, line_number));
|
||||
exit (EXIT_FAILURE);
|
||||
goto invalid;
|
||||
}
|
||||
return uc;
|
||||
}
|
||||
@@ -414,76 +386,129 @@ as specified in https://www.python.org/peps/pep-0263.html.\n"),
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Read an UTF-8 encoded character. */
|
||||
unsigned char buf[6];
|
||||
unsigned int count;
|
||||
/* Read an UTF-8 encoded character.
|
||||
Reject invalid input, like u8_mbtouc does. */
|
||||
int c;
|
||||
ucs4_t uc;
|
||||
|
||||
c = phase1_getc ();
|
||||
if (c == EOF)
|
||||
return UEOF;
|
||||
buf[0] = c;
|
||||
count = 1;
|
||||
|
||||
if (buf[0] >= 0xc0)
|
||||
if (c < 0x80)
|
||||
{
|
||||
c = phase1_getc ();
|
||||
if (c == EOF)
|
||||
return UEOF;
|
||||
buf[1] = c;
|
||||
count = 2;
|
||||
uc = c;
|
||||
}
|
||||
|
||||
if (buf[0] >= 0xe0
|
||||
&& ((buf[1] ^ 0x80) < 0x40))
|
||||
else if (c < 0xc2)
|
||||
goto invalid;
|
||||
else if (c < 0xe0)
|
||||
{
|
||||
c = phase1_getc ();
|
||||
if (c == EOF)
|
||||
return UEOF;
|
||||
buf[2] = c;
|
||||
count = 3;
|
||||
int c1 = phase1_getc ();
|
||||
if (c1 == EOF)
|
||||
goto incomplete_at_eof;
|
||||
if (c1 == '\n')
|
||||
goto incomplete_at_eol;
|
||||
if ((c1 ^ 0x80) < 0x40)
|
||||
uc = ((unsigned int) (c & 0x1f) << 6)
|
||||
| (unsigned int) (c1 ^ 0x80);
|
||||
else
|
||||
goto invalid;
|
||||
}
|
||||
|
||||
if (buf[0] >= 0xf0
|
||||
&& ((buf[1] ^ 0x80) < 0x40)
|
||||
&& ((buf[2] ^ 0x80) < 0x40))
|
||||
else if (c < 0xf0)
|
||||
{
|
||||
c = phase1_getc ();
|
||||
if (c == EOF)
|
||||
return UEOF;
|
||||
buf[3] = c;
|
||||
count = 4;
|
||||
int c1 = phase1_getc ();
|
||||
if (c1 == EOF)
|
||||
goto incomplete_at_eof;
|
||||
if (c1 == '\n')
|
||||
goto incomplete_at_eol;
|
||||
if ((c1 ^ 0x80) < 0x40
|
||||
&& (c >= 0xe1 || c1 >= 0xa0)
|
||||
&& (c != 0xed || c1 < 0xa0))
|
||||
{
|
||||
int c2 = phase1_getc ();
|
||||
if (c2 == EOF)
|
||||
goto incomplete_at_eof;
|
||||
if (c2 == '\n')
|
||||
goto incomplete_at_eol;
|
||||
if ((c2 ^ 0x80) < 0x40)
|
||||
uc = ((unsigned int) (c & 0x0f) << 12)
|
||||
| ((unsigned int) (c1 ^ 0x80) << 6)
|
||||
| (unsigned int) (c2 ^ 0x80);
|
||||
else
|
||||
goto invalid;
|
||||
}
|
||||
else
|
||||
goto invalid;
|
||||
}
|
||||
|
||||
if (buf[0] >= 0xf8
|
||||
&& ((buf[1] ^ 0x80) < 0x40)
|
||||
&& ((buf[2] ^ 0x80) < 0x40)
|
||||
&& ((buf[3] ^ 0x80) < 0x40))
|
||||
else if (c < 0xf8)
|
||||
{
|
||||
c = phase1_getc ();
|
||||
if (c == EOF)
|
||||
return UEOF;
|
||||
buf[4] = c;
|
||||
count = 5;
|
||||
int c1 = phase1_getc ();
|
||||
if (c1 == EOF)
|
||||
goto incomplete_at_eof;
|
||||
if (c1 == '\n')
|
||||
goto incomplete_at_eol;
|
||||
if ((c1 ^ 0x80) < 0x40
|
||||
&& (c >= 0xf1 || c1 >= 0x90)
|
||||
&& (c < 0xf4 || (c == 0xf4 && c1 < 0x90)))
|
||||
{
|
||||
int c2 = phase1_getc ();
|
||||
if (c2 == EOF)
|
||||
goto incomplete_at_eof;
|
||||
if (c2 == '\n')
|
||||
goto incomplete_at_eol;
|
||||
if ((c2 ^ 0x80) < 0x40)
|
||||
{
|
||||
int c3 = phase1_getc ();
|
||||
if (c3 == EOF)
|
||||
goto incomplete_at_eof;
|
||||
if (c3 == '\n')
|
||||
goto incomplete_at_eol;
|
||||
if ((c3 ^ 0x80) < 0x40)
|
||||
uc = ((unsigned int) (c & 0x07) << 18)
|
||||
| ((unsigned int) (c1 ^ 0x80) << 12)
|
||||
| ((unsigned int) (c2 ^ 0x80) << 6)
|
||||
| (unsigned int) (c3 ^ 0x80);
|
||||
else
|
||||
goto invalid;
|
||||
}
|
||||
else
|
||||
goto invalid;
|
||||
}
|
||||
else
|
||||
goto invalid;
|
||||
}
|
||||
else
|
||||
goto invalid;
|
||||
|
||||
if (buf[0] >= 0xfc
|
||||
&& ((buf[1] ^ 0x80) < 0x40)
|
||||
&& ((buf[2] ^ 0x80) < 0x40)
|
||||
&& ((buf[3] ^ 0x80) < 0x40)
|
||||
&& ((buf[4] ^ 0x80) < 0x40))
|
||||
{
|
||||
c = phase1_getc ();
|
||||
if (c == EOF)
|
||||
return UEOF;
|
||||
buf[5] = c;
|
||||
count = 6;
|
||||
}
|
||||
|
||||
u8_mbtouc (&uc, buf, count);
|
||||
return uc;
|
||||
}
|
||||
|
||||
invalid:
|
||||
/* An invalid multibyte sequence was encountered. */
|
||||
multiline_error (xstrdup (""),
|
||||
xasprintf (_("\
|
||||
%s:%d: Invalid multibyte sequence.\n\
|
||||
Please specify the correct source encoding through --from-code or through a\n\
|
||||
comment as specified in https://www.python.org/peps/pep-0263.html.\n"),
|
||||
real_file_name, line_number));
|
||||
exit (EXIT_FAILURE);
|
||||
|
||||
incomplete_at_eof:
|
||||
multiline_error (xstrdup (""),
|
||||
xasprintf (_("\
|
||||
%s:%d: Incomplete multibyte sequence at end of file.\n\
|
||||
Please specify the correct source encoding through --from-code or through a\n\
|
||||
comment as specified in https://www.python.org/peps/pep-0263.html.\n"),
|
||||
real_file_name, line_number));
|
||||
exit (EXIT_FAILURE);
|
||||
|
||||
incomplete_at_eol:
|
||||
multiline_error (xstrdup (""),
|
||||
xasprintf (_("\
|
||||
%s:%d: Incomplete multibyte sequence at end of line.\n\
|
||||
Please specify the correct source encoding through --from-code or through a\n\
|
||||
comment as specified in https://www.python.org/peps/pep-0263.html.\n"),
|
||||
real_file_name, line_number - 1));
|
||||
exit (EXIT_FAILURE);
|
||||
}
|
||||
|
||||
/* Supports max (9, UNINAME_MAX + 3) pushback characters. */
|
||||
@@ -603,7 +628,7 @@ set_current_file_source_encoding (const char *canon_encoding)
|
||||
#else
|
||||
error_at_line (EXIT_FAILURE, 0, logical_file_name, line_number - 1,
|
||||
_("Cannot convert from \"%s\" to \"%s\". %s relies on iconv(). This version was built without iconv()."),
|
||||
xgettext_global_source_encoding, po_charset_utf8,
|
||||
xgettext_current_file_source_encoding, po_charset_utf8,
|
||||
basename (program_name));
|
||||
#endif
|
||||
}
|
||||
@@ -1672,7 +1697,11 @@ extract_python (FILE *f,
|
||||
last_comment_line = -1;
|
||||
last_non_comment_line = -1;
|
||||
|
||||
xgettext_current_file_source_encoding = xgettext_global_source_encoding;
|
||||
/* For Python, the default source file encoding is UTF-8. This is specified
|
||||
in PEP 3120. */
|
||||
xgettext_current_file_source_encoding =
|
||||
(xgettext_global_source_encoding != NULL ? xgettext_global_source_encoding :
|
||||
po_charset_utf8);
|
||||
#if HAVE_ICONV
|
||||
xgettext_current_file_source_iconv = xgettext_global_source_iconv;
|
||||
#endif
|
||||
|
||||
@@ -35,7 +35,9 @@
|
||||
#define _(str) gettext (str)
|
||||
|
||||
|
||||
/* Canonicalized encoding name for all input files. */
|
||||
/* Canonicalized encoding name for all input files.
|
||||
It can be NULL when the --from-code option has not been specified. In this
|
||||
case, the default (ASCII or UTF-8) depends on the programming language. */
|
||||
const char *xgettext_global_source_encoding;
|
||||
|
||||
#if HAVE_ICONV
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/* Keeping track of the encoding of strings to be extracted.
|
||||
Copyright (C) 2001-2018 Free Software Foundation, Inc.
|
||||
Copyright (C) 2001-2019 Free Software Foundation, Inc.
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
@@ -49,7 +49,9 @@ extern char *non_ascii_error_message (lexical_context_ty lcontext,
|
||||
size_t line_number);
|
||||
|
||||
|
||||
/* Canonicalized encoding name for all input files. */
|
||||
/* Canonicalized encoding name for all input files.
|
||||
It can be NULL when the --from-code option has not been specified. In this
|
||||
case, the default (ASCII or UTF-8) depends on the programming language. */
|
||||
extern const char *xgettext_global_source_encoding;
|
||||
|
||||
#if HAVE_ICONV
|
||||
|
||||
@@ -347,7 +347,7 @@ main (int argc, char *argv[])
|
||||
|
||||
/* Set initial value of variables. */
|
||||
default_domain = MESSAGE_DOMAIN_DEFAULT;
|
||||
xgettext_global_source_encoding = po_charset_ascii;
|
||||
xgettext_global_source_encoding = NULL;
|
||||
init_flag_table_c ();
|
||||
init_flag_table_objc ();
|
||||
init_flag_table_gcc_internal ();
|
||||
@@ -768,7 +768,8 @@ xgettext cannot work without keywords to look for"));
|
||||
|
||||
/* Allocate converter from xgettext_global_source_encoding to UTF-8 (except
|
||||
from ASCII or UTF-8, when this conversion is a no-op). */
|
||||
if (xgettext_global_source_encoding != po_charset_ascii
|
||||
if (xgettext_global_source_encoding != NULL
|
||||
&& xgettext_global_source_encoding != po_charset_ascii
|
||||
&& xgettext_global_source_encoding != po_charset_utf8)
|
||||
{
|
||||
#if HAVE_ICONV
|
||||
@@ -965,7 +966,8 @@ xgettext cannot work without keywords to look for"));
|
||||
|
||||
/* Free the allocated converter. */
|
||||
#if HAVE_ICONV
|
||||
if (xgettext_global_source_encoding != po_charset_ascii
|
||||
if (xgettext_global_source_encoding != NULL
|
||||
&& xgettext_global_source_encoding != po_charset_ascii
|
||||
&& xgettext_global_source_encoding != po_charset_utf8)
|
||||
iconv_close (xgettext_global_source_iconv);
|
||||
#endif
|
||||
@@ -1769,7 +1771,9 @@ extract_from_file (const char *file_name, extractor_ty extractor,
|
||||
|
||||
/* Set the default for the source file encoding. May be overridden by
|
||||
the extractor function. */
|
||||
xgettext_current_source_encoding = xgettext_global_source_encoding;
|
||||
xgettext_current_source_encoding =
|
||||
(xgettext_global_source_encoding != NULL ? xgettext_global_source_encoding :
|
||||
po_charset_ascii);
|
||||
#if HAVE_ICONV
|
||||
xgettext_current_source_iconv = xgettext_global_source_iconv;
|
||||
#endif
|
||||
|
||||
@@ -19,6 +19,12 @@ cat <<\EOF > xg-py-3b.py
|
||||
print gettext.gettext("ÆüËܸì");
|
||||
EOF
|
||||
|
||||
cat <<\EOF > xg-py-3u.py
|
||||
#!/usr/bin/env python
|
||||
# TRANSLATORS: François Pinard is a hero.
|
||||
print gettext.gettext("日本語");
|
||||
EOF
|
||||
|
||||
cat <<\EOF > xg-py-3.ok
|
||||
# SOME DESCRIPTIVE TITLE.
|
||||
# Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
|
||||
@@ -68,6 +74,17 @@ cat xg-py-3b.tmp | grep -v 'POT-Creation-Date' | LC_ALL=C tr -d '\r' > xg-py-3b.
|
||||
|
||||
${DIFF} xg-py-3.ok xg-py-3b.pot || Exit 1
|
||||
|
||||
# Verify that if the source file has no magic coding comment but is UTF-8
|
||||
# encoded, xgettext succeeds. (PEP 3120)
|
||||
|
||||
${XGETTEXT} --add-comments=TRANSLATORS: --no-location \
|
||||
-o xg-py-3u.tmp xg-py-3u.py || Exit 1
|
||||
# Don't simplify this to "grep ... < xg-py-3u.tmp", otherwise OpenBSD 4.0 grep
|
||||
# only outputs "Binary file (standard input) matches".
|
||||
cat xg-py-3u.tmp | grep -v 'POT-Creation-Date' | LC_ALL=C tr -d '\r' > xg-py-3u.pot
|
||||
|
||||
${DIFF} xg-py-3.ok xg-py-3u.pot || Exit 1
|
||||
|
||||
# Verify that if the source file has a magic coding comment and a --from-code
|
||||
# option is given, the magic coding comment takes precedence over it.
|
||||
|
||||
|
||||
Reference in New Issue
Block a user