-
Notifications
You must be signed in to change notification settings - Fork 52
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fixes for Encode::utf8 #97
Changes from all commits
dc4214d
c1bbfc0
2fd6264
bb60a14
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -35,17 +35,6 @@ UNIMPLEMENTED(_encoded_bytes_to_utf8, I32) | |
#define SvIV_nomg SvIV | ||
#endif | ||
|
||
#ifdef UTF8_DISALLOW_ILLEGAL_INTERCHANGE | ||
# define UTF8_ALLOW_STRICT UTF8_DISALLOW_ILLEGAL_INTERCHANGE | ||
#else | ||
# define UTF8_ALLOW_STRICT 0 | ||
#endif | ||
|
||
#define UTF8_ALLOW_NONSTRICT (UTF8_ALLOW_ANY & \ | ||
~(UTF8_ALLOW_CONTINUATION | \ | ||
UTF8_ALLOW_NON_CONTINUATION | \ | ||
UTF8_ALLOW_LONG)) | ||
|
||
static void | ||
Encode_XSEncoding(pTHX_ encode_t * enc) | ||
{ | ||
|
@@ -114,6 +103,7 @@ utf8_safe_upgrade(pTHX_ SV ** src, U8 ** s, STRLEN * slen, bool modify) | |
|
||
#define ERR_ENCODE_NOMAP "\"\\x{%04" UVxf "}\" does not map to %s" | ||
#define ERR_DECODE_NOMAP "%s \"\\x%02" UVXf "\" does not map to Unicode" | ||
#define ERR_DECODE_STR_NOMAP "%s \"%s\" does not map to Unicode" | ||
|
||
static SV * | ||
do_fallback_cb(pTHX_ UV ch, SV *fallback_cb) | ||
|
@@ -138,6 +128,31 @@ do_fallback_cb(pTHX_ UV ch, SV *fallback_cb) | |
return retval; | ||
} | ||
|
||
static SV * | ||
do_bytes_fallback_cb(pTHX_ U8 *s, STRLEN slen, SV *fallback_cb) | ||
{ | ||
dSP; | ||
int argc; | ||
STRLEN i; | ||
SV *retval = newSVpvn("",0); | ||
ENTER; | ||
SAVETMPS; | ||
PUSHMARK(sp); | ||
for (i=0; i<slen; ++i) | ||
XPUSHs(sv_2mortal(newSVuv(s[i]))); | ||
PUTBACK; | ||
argc = call_sv(fallback_cb, G_SCALAR); | ||
SPAGAIN; | ||
if (argc != 1){ | ||
croak("fallback sub must return scalar!"); | ||
} | ||
sv_catsv(retval, POPs); | ||
PUTBACK; | ||
FREETMPS; | ||
LEAVE; | ||
return retval; | ||
} | ||
|
||
static SV * | ||
encode_method(pTHX_ const encode_t * enc, const encpage_t * dir, SV * src, U8 * s, STRLEN slen, | ||
int check, STRLEN * offset, SV * term, int * retcode, | ||
|
@@ -382,7 +397,7 @@ convert_utf8_multi_seq(U8* s, STRLEN len, STRLEN *rlen) | |
U8 *ptr = s; | ||
bool overflowed = 0; | ||
|
||
uv = NATIVE_TO_UTF(*s) & UTF_START_MASK(len); | ||
uv = NATIVE_TO_UTF(*s) & UTF_START_MASK(UTF8SKIP(s)); | ||
|
||
len--; | ||
s++; | ||
|
@@ -417,6 +432,8 @@ process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv, | |
int check; | ||
U8 *d; | ||
STRLEN dlen; | ||
char esc[80]; /* need to store UTF8SKIP * 6 + 1 */ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Avoid magic numbers if possible. Why not use esc[UTF8_MAXBYTES * 6 + 1]? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Looks like There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thats a good idea, declaring There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
int i; | ||
|
||
if (SvROK(check_sv)) { | ||
/* croak("UTF-8 decoder doesn't support callback CHECK"); */ | ||
|
@@ -475,40 +492,56 @@ process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv, | |
} | ||
|
||
/* If we get here there is something wrong with alleged UTF-8 */ | ||
/* uv is used only when encoding */ | ||
malformed_byte: | ||
uv = (UV)*s; | ||
if (ulen == 0) | ||
if (uv == 0) | ||
uv = (UV)*s; | ||
if (encode || ulen == 0) | ||
ulen = 1; | ||
|
||
malformed: | ||
if (!encode && (check & (ENCODE_DIE_ON_ERR|ENCODE_WARN_ON_ERR|ENCODE_PERLQQ))) | ||
for (i=0; i<ulen; ++i) sprintf(esc+4*i, "\\x%02X", s[i]); | ||
if (check & ENCODE_DIE_ON_ERR){ | ||
if (encode) | ||
Perl_croak(aTHX_ ERR_ENCODE_NOMAP, uv, "utf8"); | ||
Perl_croak(aTHX_ ERR_ENCODE_NOMAP, uv, (strict ? "UTF-8" : "utf8")); | ||
else | ||
Perl_croak(aTHX_ ERR_DECODE_NOMAP, "utf8", uv); | ||
Perl_croak(aTHX_ ERR_DECODE_STR_NOMAP, (strict ? "UTF-8" : "utf8"), esc); | ||
} | ||
if (check & ENCODE_WARN_ON_ERR){ | ||
if (encode) | ||
Perl_warner(aTHX_ packWARN(WARN_UTF8), | ||
ERR_ENCODE_NOMAP, uv, "utf8"); | ||
ERR_ENCODE_NOMAP, uv, (strict ? "UTF-8" : "utf8")); | ||
else | ||
Perl_warner(aTHX_ packWARN(WARN_UTF8), | ||
ERR_DECODE_NOMAP, "utf8", uv); | ||
ERR_DECODE_STR_NOMAP, (strict ? "UTF-8" : "utf8"), esc); | ||
} | ||
if (check & ENCODE_RETURN_ON_ERR) { | ||
break; | ||
} | ||
if (check & (ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){ | ||
SV* subchar = | ||
(fallback_cb != &PL_sv_undef) | ||
? do_fallback_cb(aTHX_ uv, fallback_cb) | ||
: newSVpvf(check & ENCODE_PERLQQ | ||
? (ulen == 1 ? "\\x%02" UVXf : "\\x{%04" UVXf "}") | ||
: check & ENCODE_HTMLCREF ? "&#%" UVuf ";" | ||
: "&#x%" UVxf ";", uv); | ||
if (encode){ | ||
SvUTF8_off(subchar); /* make sure no decoded string gets in */ | ||
} | ||
SV* subchar; | ||
if (encode) { | ||
subchar = | ||
(fallback_cb != &PL_sv_undef) | ||
? do_fallback_cb(aTHX_ uv, fallback_cb) | ||
: newSVpvf(check & ENCODE_PERLQQ | ||
? (ulen == 1 ? "\\x%02" UVXf : "\\x{%04" UVXf "}") | ||
: check & ENCODE_HTMLCREF ? "&#%" UVuf ";" | ||
: "&#x%" UVxf ";", uv); | ||
SvUTF8_off(subchar); /* make sure no decoded string gets in */ | ||
} else { | ||
if (fallback_cb != &PL_sv_undef) { | ||
/* in decode mode we have sequence of wrong bytes */ | ||
subchar = do_bytes_fallback_cb(aTHX_ s, ulen, fallback_cb); | ||
} else { | ||
char *ptr = esc; | ||
/* ENCODE_PERLQQ is already stored in esc */ | ||
if (check & (ENCODE_HTMLCREF|ENCODE_XMLCREF)) | ||
for (i=0; i<ulen; ++i) ptr += sprintf(ptr, ((check & ENCODE_HTMLCREF) ? "&#%u;" : "&#x%02X;"), s[i]); | ||
subchar = newSVpvn(esc, strlen(esc)); | ||
} | ||
} | ||
dlen += SvCUR(subchar) - ulen; | ||
SvCUR_set(dst, d-(U8 *)SvPVX(dst)); | ||
*SvEND(dst) = '\0'; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
use strict; | ||
use warnings; | ||
|
||
use Test::More; | ||
use Encode qw(encode decode FB_CROAK LEAVE_SRC); | ||
|
||
plan tests => 12; | ||
|
||
my @invalid; | ||
|
||
ok ! defined eval { encode('UTF-8', "\x{D800}", FB_CROAK | LEAVE_SRC) }, 'Surrogate codepoint \x{D800} is not encoded to strict UTF-8'; | ||
like $@, qr/^"\\x\{d800\}" does not map to UTF-8 /, 'Error message contains strict UTF-8 name'; | ||
@invalid = (); | ||
encode('UTF-8', "\x{D800}", sub { @invalid = @_; return ""; }); | ||
is_deeply \@invalid, [ 0xD800 ], 'Fallback coderef contains invalid codepoint 0xD800'; | ||
|
||
ok ! defined eval { decode('UTF-8', "\xed\xa0\x80", FB_CROAK | LEAVE_SRC) }, 'Surrogate UTF-8 byte sequence \xED\xA0\x80 is decoded with strict UTF-8 decoder'; | ||
like $@, qr/^UTF-8 "\\xED\\xA0\\x80" does not map to Unicode /, 'Error message contains strict UTF-8 name and original (not decoded) invalid sequence'; | ||
@invalid = (); | ||
decode('UTF-8', "\xed\xa0\x80", sub { @invalid = @_; return ""; }); | ||
is_deeply \@invalid, [ 0xED, 0xA0, 0x80 ], 'Fallback coderef contains invalid byte sequence 0xED, 0xA0, 0x80'; | ||
|
||
ok ! defined eval { decode('UTF-8', "\xed\xa0", FB_CROAK | LEAVE_SRC) }, 'Invalid byte sequence \xED\xA0 is not decoded with strict UTF-8 decoder'; | ||
like $@, qr/^UTF-8 "\\xED\\xA0" does not map to Unicode /, 'Error message contains strict UTF-8 name and original (not decoded) invalid sequence'; | ||
@invalid = (); | ||
decode('UTF-8', "\xed\xa0", sub { @invalid = @_; return ""; }); | ||
is_deeply \@invalid, [ 0xED, 0xA0 ], 'Fallback coderef contains invalid byte sequence 0xED, 0xA0'; | ||
|
||
ok ! defined eval { decode('utf8', "\xed\xa0", FB_CROAK | LEAVE_SRC) }, 'Invalid byte sequence \xED\xA0 is not decoded with non-strict utf8 decoder'; | ||
like $@, qr/^utf8 "\\xED\\xA0" does not map to Unicode /, 'Error message contains non-strict utf8 name and original (not decoded) invalid sequence'; | ||
decode('utf8', "\xed\xa0", sub { @invalid = @_; return ""; }); | ||
is_deeply \@invalid, [ 0xED, 0xA0 ], 'Fallback coderef contains invalid byte sequence 0xED, 0xA0'; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The allocation of retval here would cause a memory leak if the callback invoked in
call_sv
throws an exception.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Init code is same as in do_fallback_cb, I just copied it. So it needs to be fixed on both places. Any idea how to easily fix in Encode?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What have we learned? Duplicated code is always a bad idea? Defer the allocation until successful execution of callback?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Agree, but in this case when it is needed to call
call_sv
with different argument count and types we cannot do nothing more... maybe just move those perl macros like ENTER or PUSHMARK into another macros, but that would lead to even complicated code.Now when I'm looking at
do_fallback_cb
again, that function just returns new (non-mortal) SV* which is return value from perl function called viacall_sv
. So what is needed is to propagate mortal SV* in POPs from ENTER/LEAVE section for future processing. What is correct and safe way to do that? Callingretval = newSVsv(POPs)
prior to FREETMPS?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
retval = newSVsv(POPs);
priorFREETMPS
would be correct. You could also make it mortal (sv_2mortal()
) prior returning the retval and remove the call toSvREFCNT_dec
in the caller code, this would make the code a bit safer since it's always preferable to work with mortal SV's if possible, on a second look this would require a bit of refactoring.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is not a good idea to make it mortal as mortal variables are freed later, when leaving encode/decode XS function. Function do_fallback_cb can be called too many times, each time for invalid character in input string. And input string is lot of times untrusted and can comes from attacker.
I tried it with mortal variables and following code
perl -MEncode -e 'my $b = "\x{ff}" x 10000000; decode("UTF-8", $b, sub { "$_[0]" })'
eats 825MB. Without mortal variables only 58MB. Which is a huge difference.
I looked at other XS code available in perl repository and it uses following pattern:
Looking into perl source code, FREETMPS just remove tmp flag and decrements refcount of all variables placed in tmp stack and let stack empty. So SvREFCNT_inc prevents destroying retval in FREETMPS and make it as normal non-mortal variable. Looks like it could save some memory as new variable is not needed to allocate.
But reusing SV from POPs or creating new variable via newSVsv() is not enough. That Encode code expects that returned SV has valid PV slot. So this is reason why such construct
newSVpvn("",0); ... sv_catsv(retval, POPs);
is used.I will try to cleanup this code and will use above pattern.