diff --git a/encoding.pm b/encoding.pm index c0bff08..167c2fb 100644 --- a/encoding.pm +++ b/encoding.pm @@ -1,6 +1,6 @@ -# $Id: encoding.pm,v 2.12 2013/04/26 18:30:46 dankogai Exp $ +# $Id: encoding.pm,v 2.13 2013/04/26 18:30:46 dankogai Exp $ package encoding; -our $VERSION = sprintf "%d.%02d", q$Revision: 2.12 $ =~ /(\d+)/g; +our $VERSION = sprintf "%d.%02d", q$Revision: 2.13 $ =~ /(\d+)/g; use Encode; use strict; @@ -131,11 +131,23 @@ sub import { $name = $enc->name; # canonize unless ( $arg{Filter} ) { DEBUG and warn "_exception($name) = ", _exception($name); - _exception($name) or ${^ENCODING} = $enc; + if (! _exception($name)) { + if (!$^V || $^V lt v5.21.7) { + ${^ENCODING} = $enc; + } + else { + # Starting with 5.21.7, this pragma uses a shadow variable + # designed explicitly for it, ${^E_NCODING}, to enforce + # lexical scope; instead of ${^ENCODING}. + $^H{'encoding'} = 1; + ${^E_NCODING} = $enc; + } + } $HAS_PERLIO or return 1; } else { defined( ${^ENCODING} ) and undef ${^ENCODING}; + undef ${^E_NCODING} if $^V && $^V ge v5.21.7; # implicitly 'use utf8' require utf8; # to fetch $utf8::hint_bits; @@ -185,6 +197,7 @@ sub import { sub unimport { no warnings; undef ${^ENCODING}; + undef ${^E_NCODING} if $^V && $^V ge v5.21.7; if ($HAS_PERLIO) { binmode( STDIN, ":raw" ); binmode( STDOUT, ":raw" ); @@ -205,20 +218,12 @@ __END__ =head1 NAME -encoding - allows you to write your script in non-ascii or non-utf8 +encoding - allows you to write your script in non-ASCII and non-UTF-8 =head1 WARNING -This module is deprecated under perl 5.18. It uses a mechanism provided by -perl that is deprecated under 5.18 and higher, and may be removed in a -future version. - -The easiest and the best alternative is to write your script in UTF-8 -and declear: - - use utf8; # not use encoding ':utf8'; - -See L and L for details. +This module has been deprecated since perl v5.18. See L and +L. =head1 SYNOPSIS @@ -235,68 +240,84 @@ See L and L for details. 
# A simple euc-cn => utf-8 converter use encoding "euc-cn", STDOUT => "utf8"; while(<>){print}; - # "no encoding;" supported (but not scoped!) + # "no encoding;" supported no encoding; # an alternate way, Filter use encoding "euc-jp", Filter=>1; # now you can use kanji identifiers -- in euc-jp! - # switch on locale - - # note that this probably means that unless you have a complete control - # over the environments the application is ever going to be run, you should - # NOT use the feature of encoding pragma allowing you to write your script - # in any recognized encoding because changing locale settings will wreck - # the script; you can of course still use the other features of the pragma. + # encode based on the current locale - specialized purposes only; + # fraught with danger!! use encoding ':locale'; -=head1 ABSTRACT +=head1 DESCRIPTION -Let's start with a bit of history: Perl 5.6.0 introduced Unicode -support. You could apply C and regexes even to complex CJK -characters -- so long as the script was written in UTF-8. But back -then, text editors that supported UTF-8 were still rare and many users -instead chose to write scripts in legacy encodings, giving up a whole -new feature of Perl 5.6. +This pragma is used to enable a Perl script to be written in encodings that +aren't strictly ASCII nor UTF-8. It translates all or portions of the Perl +program script from a given encoding into UTF-8, and changes the PerlIO layers +of C and C to the encoding specified. -Rewind to the future: starting from perl 5.8.0 with the B -pragma, you can write your script in any encoding you like (so long -as the C module supports it) and still enjoy Unicode support. -This pragma achieves that by doing the following: +This pragma dates from the days when UTF-8-enabled editors were uncommon. But +that was long ago, and the need for it is greatly diminished. 
That, coupled +with the fact that it doesn't work with threads, along with other problems, +(see L</BUGS>) have led to its being deprecated. It is planned to remove this +pragma in a future Perl version. New code should be written in UTF-8, and the +C<use utf8> pragma used instead (see L<perluniintro> and L<utf8> for details). +Old code should be converted to UTF-8, via something like the recipe in the +L</SYNOPSIS> (though this simple approach may require manual adjustments +afterwards). -=over +The only legitimate use of this pragma is almost certainly just one per file, +near the top, with file scope, as the file is likely going to only be written +in one encoding. Further restrictions apply in Perls before v5.22 (see +L</Prior to Perl v5.22>). -=item * +There are two basic modes of operation (plus turning it off): -Internally converts all literals (C<q//,qq//,qr//,qw///, qx//>) from -the encoding specified to utf8. In Perl 5.8.1 and later, literals in -C<tr///> and C<DATA> pseudo-filehandle are also converted. +=over 4 -=item * +=item C<use encoding ['I<ENCNAME>'] ;> -Changing PerlIO layers of C<STDIN> and C<STDOUT> to the encoding - specified. +This is the normal operation. It translates various literals encountered in +the Perl source file from the encoding I<ENCNAME> into UTF-8, and similarly +converts character code points. This is used when the script is a combination +of ASCII (for the variable names and punctuation, I<etc.>), but the literal +data is in the specified encoding. -=back +I<ENCNAME> is optional. If omitted, the encoding specified in the environment +variable L<C<PERL_ENCODING>|perlrun/PERL_ENCODING> is used. If this isn't +set, or the resolved-to encoding is not known to C<L<Encode>>, the error +C<Unknown encoding 'I<ENCNAME>'> will be thrown. -=head2 Literal Conversions +Starting in Perl v5.8.6 (C<Encode> version 2.0.1), I<ENCNAME> may be the +name C<:locale>. This is for very specialized applications, and is documented +in L</The C<:locale> sub-pragma> below. -You can write code in EUC-JP as follows: +The literals that are converted are C<q//, qq//, qr//, qw///, qx//>, and +starting in v5.8.1, C<tr///>. Operations that do conversions include C<chr>, +C<ord>, C<utf8::upgrade> (but not C<utf8::downgrade>), and C<chomp>. 
+ +Also starting in v5.8.1, the C pseudo-filehandle is translated from the +encoding into UTF-8. + +For example, you can write code in EUC-JP as follows: my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji #<-char-><-char-> # 4 octets s/\bCamel\b/$Rakuda/; And with C in effect, it is the same thing as -the code in UTF-8: +that code in UTF-8: my $Rakuda = "\x{99F1}\x{99DD}"; # two Unicode Characters s/\bCamel\b/$Rakuda/; -=head2 PerlIO layers for C +See L below for a more complete example. -The B pragma also modifies the filehandle layers of -STDIN and STDOUT to the specified encoding. Therefore, +Unless C<${^UNICODE}> (available starting in v5.8.2) exists and is non-zero, the +PerlIO layers of C and C are set to "C<:encoding(I)>". +Therefore, use encoding "euc-jp"; my $message = "Camel is the symbol of perl.\n"; @@ -304,183 +325,145 @@ STDIN and STDOUT to the specified encoding. Therefore, $message =~ s/\bCamel\b/$Rakuda/; print $message; -Will print "\xF1\xD1\xF1\xCC is the symbol of perl.\n", -not "\x{99F1}\x{99DD} is the symbol of perl.\n". - -You can override this by giving extra arguments; see below. - -=head2 Implicit upgrading for byte strings - -By default, if strings operating under byte semantics and strings -with Unicode character data are concatenated, the new string will -be created by decoding the byte strings as I. - -The B pragma changes this to use the specified encoding -instead. For example: +will print - use encoding 'utf8'; - my $string = chr(20000); # a Unicode string - utf8::encode($string); # now it's a UTF-8 encoded byte string - # concatenate with another Unicode string - print length($string . chr(20000)); + "\xF1\xD1\xF1\xCC is the symbol of perl.\n" -Will print C<2>, because C<$string> is upgraded as UTF-8. Without -C, it will print C<4> instead, since C<$string> -is three octets when interpreted as Latin-1. 
+not -=head2 Side effects + "\x{99F1}\x{99DD} is the symbol of perl.\n" -If the C<encoding> pragma is in scope then the lengths returned are -calculated from the length of C<$/> in Unicode characters, which is not -always the same as the length of C<$/> in the native encoding. +You can override this by giving extra arguments; see below. -This pragma affects utf8::upgrade, but not utf8::downgrade. +Note that C<STDERR> WILL NOT be changed, regardless. -=head1 FEATURES THAT REQUIRE 5.8.1 +Also note that non-STD file handles remain unaffected. Use C<use open> or C<binmode> to change the layers of those. -Some of the features offered by this pragma requires perl 5.8.1. Most -of these are done by Inaba Hiroto. Any other features and changes -are good for 5.8.0. +=item C<use encoding I<ENCNAME> Filter=E<gt>1;> -=over +This operates as above, but the C<Filter> argument with a non-zero +value causes the entire script, and not just literals, to be translated from +the encoding into UTF-8. This allows identifiers in the source to be in that +encoding as well. (Problems may occur if the encoding is not a superset of +ASCII; imagine all your semi-colons being translated into something +different.) One can use this form to make -=item "NON-EUC" doublebyte encodings + ${"\x{4eba}"}++ -Because perl needs to parse script before applying this pragma, such -encodings as Shift_JIS and Big-5 that may contain '\' (BACKSLASH; -\x5c) in the second byte fails because the second byte may -accidentally escape the quoting character that follows. Perl 5.8.1 -or later fixes this problem. +work. (This is equivalent to C<$I<human>++>, where I<human> is a single Han +ideograph). -=item tr// +This effectively means that your source code behaves as if it were written in +UTF-8 with C<'use utf8'> in effect. So even if your editor only supports +Shift_JIS, for example, you can still try examples in Chapter 15 of +C<Programming Perl (3rd Edition.)>. -C<tr//> was overlooked by Perl 5 porters when they released perl 5.8.0 -See the section below for details. +This option is significantly slower than the other one. 
-=item DATA pseudo-filehandle +=item C -Another feature that was overlooked was C. +Unsets the script encoding. The layers of C, C are +reset to "C<:raw>" (the default unprocessed raw stream of bytes). =back -=head1 USAGE +=head1 OPTIONS -=over 4 +=head2 Setting C and/or C individually -=item use encoding [I] ; +The encodings of C and C are individually settable by parameters to +the pragma: -Sets the script encoding to I. And unless ${^UNICODE} -exists and non-zero, PerlIO layers of STDIN and STDOUT are set to -":encoding(I)". + use encoding 'euc-tw', STDIN => 'greek' ...; -Note that STDERR WILL NOT be changed. +In this case, you cannot omit the first I. C<< STDIN => undef >> +turns the I/O transcoding completely off for that filehandle. -Also note that non-STD file handles remain unaffected. Use C or C to change layers of those. +When C<${^UNICODE}> (available starting in v5.8.2) exists and is non-zero, +these options will be completely ignored. See L> and +L<"C<-C>" in perlrun|perlrun/-C [numberElist]> for details. -If no encoding is specified, the environment variable L -is consulted. If no encoding can be found, the error C'> will be thrown. +=head2 The C<:locale> sub-pragma -=item use encoding I [ STDIN =E I ...] ; +Starting in v5.8.6, the encoding name may be C<:locale>. This means that the +encoding is taken from the current locale, and not hard-coded by the pragma. +Since a script really can only be encoded in exactly one encoding, this option +is dangerous. It makes sense only if the script itself is written in ASCII, +and all the possible locales that will be in use when the script is executed +are supersets of ASCII. That means that the script itself doesn't get +changed, but the I/O handles have the specified encoding added, and the +operations like C and C use that encoding. -You can also individually set encodings of STDIN and STDOUT via the -C<< STDIN => I >> form. In this case, you cannot omit the -first I. 
C<< STDIN => undef >> turns the IO transcoding -completely off. +The logic of finding which locale C<:locale> uses is as follows: -When ${^UNICODE} exists and non-zero, these options will completely -ignored. ${^UNICODE} is a variable introduced in perl 5.8.1. See -L see L and L for -details (perl 5.8.1 and later). +=over 4 -=item use encoding I<ENCNAME> Filter=E<gt>1; +=item 1. -This turns the encoding pragma into a source filter. While the -default approach just decodes interpolated literals (in qq() and -qr()), this will apply a source filter to the entire source code. See -L</The Filter Option> below for details. +If the platform supports the C<langinfo(CODESET)> interface, the codeset +returned is used as the default encoding for the open pragma. -=item no encoding; +=item 2. -Unsets the script encoding. The layers of STDIN, STDOUT are -reset to ":raw" (the default unprocessed raw stream of bytes). +If 1. didn't work but we are under the locale pragma, the environment +variables C<LC_ALL> and C<LANG> (in that order) are matched for encodings +(the part after "C<.>", if any), and if any found, that is used +as the default encoding for the open pragma. -=back +=item 3. -=head1 The Filter Option +If 1. and 2. didn't work, the environment variables C<LC_ALL> and C<LANG> +(in that order) are matched for anything looking like UTF-8, and if +any found, C<:utf8> is used as the default encoding for the open +pragma. -The magic of C<use encoding> is not applied to the names of -identifiers. In order to make C<${"\x{4eba}"}++> ($human++, where human -is a single Han ideograph) work, you still need to write your script -in UTF-8 -- or use a source filter. That's what 'Filter=>1' does. +=back -What does this mean? Your source code behaves as if it is written in -UTF-8 with 'use utf8' in effect. So even if your editor only supports -Shift_JIS, for example, you can still try examples in Chapter 15 of -C<Programming Perl (3rd Edition.)>. For instance, you can use UTF-8 -identifiers. 
+If your locale environment variables (C<LC_ALL>, C<LC_CTYPE>, C<LANG>) +contain the strings 'UTF-8' or 'UTF8' (case-insensitive matching), +the default encoding of your C<STDIN>, C<STDOUT>, and C<STDERR>, and of +B<any subsequent file open>, is UTF-8. -This option is significantly slower and (as of this writing) non-ASCII -identifiers are not very stable WITHOUT this option and with the -source code written in UTF-8. +=head1 CAVEATS -=head2 Filter-related changes at Encode version 1.87 +=head2 SIDE EFFECTS =over =item * -The Filter option now sets STDIN and STDOUT like non-filter options. -And C<< STDIN=>I<ENCODING> >> and C<< STDOUT=>I<ENCODING> >> work like -non-filter version. +If the C<encoding> pragma is in scope then the lengths returned by C<chomp> are +calculated from the length of C<$/> in Unicode characters, which is not +always the same as the length of C<$/> in the native encoding. =item * -C<use utf8> is implicitly declared so you no longer have to C<use utf8> to C<${"\x{4eba}"}++>. - -=back - -=head1 CAVEATS - -=head2 NOT SCOPED - -The pragma is a per script, not a per block lexical. Only the last -C<use encoding> or C<no encoding> matters, and it affects -B<the whole script>. However, the pragma is supported and -B<use encoding> can appear as many times as you want in a given script. -The multiple use of this pragma is discouraged. - -By the same reason, the use this pragma inside modules is also -discouraged (though not as strongly discouraged as the case above. -See below). - -If you still have to write a module with this pragma, be very careful -of the load order. See the codes below; +Without this pragma, if strings operating under byte semantics and strings +with Unicode character data are concatenated, the new string will +be created by decoding the byte strings as I<ISO 8859-1 (Latin-1)>. - # called module - package Module_IN_BAR; - use encoding "bar"; - # stuff in "bar" encoding here - 1; +The B<encoding> pragma changes this to use the specified encoding +instead. For example: - # caller script - use encoding "foo" - use Module_IN_BAR; - # surprise! use encoding "bar" is in effect. 
+ use encoding 'utf8'; + my $string = chr(20000); # a Unicode string + utf8::encode($string); # now it's a UTF-8 encoded byte string + # concatenate with another Unicode string + print length($string . chr(20000)); -The best way to avoid this oddity is to use this pragma RIGHT AFTER -other modules are loaded. i.e. +Will print C<2>, because C<$string> is upgraded as UTF-8. Without +C<use encoding 'utf8';>, it will print C<4> instead, since C<$string> +is three octets when interpreted as Latin-1. - use Module_IN_BAR; - use encoding "foo"; +=back =head2 DO NOT MIX MULTIPLE ENCODINGS Notice that only literals (string or regular expression) having only legacy code points are affected: if you mix data like this + \x{100}\xDF \xDF\x{100} the data is assumed to be in (Latin 1 and) Unicode, not in your native @@ -509,10 +492,70 @@ resort to \x{....} just to spell your name in a native encoding. So feel free to put your strings in your encoding in quotes and regexes. -=head2 tr/// with ranges +=head2 Prior to Perl v5.22 + +The pragma was a per script, not a per block lexical. Only the last +C<use encoding> or C<no encoding> mattered, and it affected +B<the whole script>. However, the C<no encoding> pragma was supported and +C<use encoding> could appear as many times as you wanted in a given script +(though only the last was effective). + +Since the scope wasn't lexical, other modules' use of C<chr>, C<ord>, I<etc.> +were affected. This led to spooky, incorrect action at a distance that is +hard to debug. + +This means you would have to be very careful of the load order: + + # called module + package Module_IN_BAR; + use encoding "bar"; + # stuff in "bar" encoding here + 1; + + # caller script + use encoding "foo"; + use Module_IN_BAR; + # surprise! use encoding "bar" is in effect. + +The best way to avoid this oddity is to use this pragma RIGHT AFTER +other modules are loaded. i.e. + + use Module_IN_BAR; + use encoding "foo"; + +=head2 Prior to Encode version 1.87 + +=over + +=item * + +C<STDIN> and C<STDOUT> were not set under the filter option. 
+And C<< STDIN=>I<ENCODING> >> and C<< STDOUT=>I<ENCODING> >> didn't work like +the non-filter version. + +=item * + +C<use utf8> wasn't implicitly declared so you had to C<use utf8> to do + + ${"\x{4eba}"}++ + +=back + +=head2 Prior to Perl v5.8.1 + +=over + +=item "NON-EUC" doublebyte encodings + +Because perl needs to parse the script before applying this pragma, such +encodings as Shift_JIS and Big-5 that may contain C<'\'> (BACKSLASH; +C<\x5c>) in the second byte fail because the second byte may +accidentally escape the quoting character that follows. + +=item C<tr///> The B<encoding> pragma works by decoding string literals in -C<q//,qq//,qr//,qw///, qx//> and so forth. In perl 5.8.0, this +C<q//,qq//,qr//,qw///, qx//> and so forth. In perl v5.8.0, this does not apply to C<tr///>. Therefore, use encoding 'euc-jp'; @@ -537,25 +580,21 @@ Does not work as =back -This counterintuitive behavior has been fixed in perl 5.8.1. +This counterintuitive behavior has been fixed in perl v5.8.1. -=head3 workaround to tr///; - -In perl 5.8.0, you can work around as follows; +In perl v5.8.0, you can work around this as follows; use encoding 'euc-jp'; # .... eval qq{ \$kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/ }; Note the C<tr//> expression is surrounded by C<qq{}>. The idea behind -is the same as classic idiom that makes C<tr///> 'interpolate'. +this is the same as the classic idiom that makes C<tr///> 'interpolate': tr/$from/$to/; # wrong! eval qq{ tr/$from/$to/ }; # workaround. -Nevertheless, in case of B<encoding> pragma even C<q//> is affected so -C<tr///> not being decoded was obviously against the will of Perl5 -Porters so it has been fixed in Perl 5.8.1 or later. +=back =head1 EXAMPLE - Greekperl @@ -590,10 +629,24 @@ Porters so it has been fixed in Perl 5.8.1 or later. print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf; -=head1 KNOWN PROBLEMS +=head1 BUGS =over +=item Thread safety + +C<use encoding ...> is not thread-safe (i.e., do not use in threaded +applications). + +=item Can't be used by more than one module in a single program. + +Only one encoding is allowed. 
If you combine modules in a program that have +different encodings, only one will be actually used. + +=item Other modules using C and C get the encoded stream + +They may be expecting something completely different. + =item literals in regex that are longer than 127 bytes For native multibyte encodings (either fixed or variable length), @@ -603,13 +656,11 @@ recoding errors for regular expression literals longer than 127 bytes. =item EBCDIC The encoding pragma is not supported on EBCDIC platforms. -(Porters who are willing and able to remove this limitation are -welcome.) -=item format +=item C -This pragma doesn't work well with format because PerlIO does not -get along very well with it. When format contains non-ascii +This pragma doesn't work well with C because PerlIO does not +get along very well with it. When C contains non-ASCII characters it prints funny or gets "wide character warnings". To understand it, try the code below. @@ -628,56 +679,19 @@ To understand it, try the code below. Without binmode this happens to work but without binmode, print() fails instead of write(). -At any rate, the very use of format is questionable when it comes to +At any rate, the very use of C is questionable when it comes to unicode characters since you have to consider such things as character width (i.e. double-width for ideographs) and directions (i.e. BIDI for Arabic and Hebrew). -=item Thread safety - -C is not thread-safe (i.e., do not use in threaded -applications). +=item See also L =back -=head2 The Logic of :locale - -The logic of C<:locale> is as follows: - -=over 4 - -=item 1. - -If the platform supports the langinfo(CODESET) interface, the codeset -returned is used as the default encoding for the open pragma. - -=item 2. - -If 1. 
didn't work but we are under the locale pragma, the environment -variables LC_ALL and LANG (in that order) are matched for encodings -(the part after C<.>, if any), and if any found, that is used -as the default encoding for the open pragma. - -=item 3. - -If 1. and 2. didn't work, the environment variables LC_ALL and LANG -(in that order) are matched for anything looking like UTF-8, and if -any found, C<:utf8> is used as the default encoding for the open -pragma. - -=back - -If your locale environment variables (LC_ALL, LC_CTYPE, LANG) -contain the strings 'UTF-8' or 'UTF8' (case-insensitive matching), -the default encoding of your STDIN, STDOUT, and STDERR, and of -B, is UTF-8. - =head1 HISTORY -This pragma first appeared in Perl 5.8.0. For features that require -5.8.1 and better, see above. - -The C<:locale> subpragma was implemented in 2.01, or Perl 5.8.6. +This pragma first appeared in Perl v5.8.0. It has been enhanced in later +releases as specified above. =head1 SEE ALSO