Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use code unit instead of code point #961

Merged
merged 1 commit into from
Mar 18, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 16 additions & 16 deletions jerry-core/ecma/builtin-objects/ecma-builtin-global.c
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/* Copyright 2014-2015 Samsung Electronics Co., Ltd.
/* Copyright 2014-2016 Samsung Electronics Co., Ltd.
* Copyright 2015-2016 University of Szeged.
*
* Licensed under the Apache License, Version 2.0 (the "License");
Expand Down Expand Up @@ -96,25 +96,25 @@ ecma_builtin_global_object_print (ecma_value_t this_arg __attr_unused___, /**< t

while (utf8_str_curr_p < utf8_str_end_p)
{
ecma_char_t code_point = lit_utf8_read_next (&utf8_str_curr_p);
ecma_char_t code_unit = lit_utf8_read_next (&utf8_str_curr_p);

if (code_point == LIT_CHAR_NULL)
if (code_unit == LIT_CHAR_NULL)
{
printf ("\\u0000");
}
else if (code_point <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
else if (code_unit <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
{
printf ("%c", (char) code_point);
printf ("%c", (char) code_unit);
}
else
{
JERRY_STATIC_ASSERT (sizeof (code_point) == 2,
JERRY_STATIC_ASSERT (sizeof (code_unit) == 2,
size_of_code_point_must_be_equal_to_2_bytes);

uint32_t byte_high = (uint32_t) JRT_EXTRACT_BIT_FIELD (ecma_char_t, code_point,
uint32_t byte_high = (uint32_t) JRT_EXTRACT_BIT_FIELD (ecma_char_t, code_unit,
JERRY_BITSINBYTE,
JERRY_BITSINBYTE);
uint32_t byte_low = (uint32_t) JRT_EXTRACT_BIT_FIELD (ecma_char_t, code_point,
uint32_t byte_low = (uint32_t) JRT_EXTRACT_BIT_FIELD (ecma_char_t, code_unit,
0,
JERRY_BITSINBYTE);

Expand Down Expand Up @@ -801,9 +801,9 @@ ecma_builtin_global_object_decode_uri_helper (ecma_value_t uri __attr_unused___,
continue;
}

lit_code_point_t decoded_byte;
ecma_char_t decoded_byte;

if (!lit_read_code_point_from_hex (input_char_p + 1, 2, &decoded_byte))
if (!lit_read_code_unit_from_hex (input_char_p + 1, 2, &decoded_byte))
{
ret_value = ecma_raise_uri_error (ECMA_ERR_MSG (""));
break;
Expand Down Expand Up @@ -857,9 +857,9 @@ ecma_builtin_global_object_decode_uri_helper (ecma_value_t uri __attr_unused___,
continue;
}

lit_code_point_t decoded_byte;
ecma_char_t decoded_byte;

if (!lit_read_code_point_from_hex (input_char_p + 1, 2, &decoded_byte))
if (!lit_read_code_unit_from_hex (input_char_p + 1, 2, &decoded_byte))
{
ret_value = ecma_raise_uri_error (ECMA_ERR_MSG (""));
break;
Expand Down Expand Up @@ -916,16 +916,16 @@ ecma_builtin_global_object_decode_uri_helper (ecma_value_t uri __attr_unused___,
}
else
{
lit_code_point_t cp;
ecma_char_t chr;

if (!lit_read_code_point_from_hex (input_char_p + 1, 2, &cp)
|| ((cp & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER))
if (!lit_read_code_unit_from_hex (input_char_p + 1, 2, &chr)
|| ((chr & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER))
{
is_valid = false;
break;
}

octets[i] = (lit_utf8_byte_t) cp;
octets[i] = (lit_utf8_byte_t) chr;
input_char_p += URI_ENCODED_BYTE_SIZE;
}
}
Expand Down
6 changes: 3 additions & 3 deletions jerry-core/ecma/builtin-objects/ecma-builtin-json.c
Original file line number Diff line number Diff line change
Expand Up @@ -178,15 +178,15 @@ ecma_builtin_json_parse_string (ecma_json_token_t *token_p) /**< token argument
}
case LIT_CHAR_LOWERCASE_U:
{
lit_code_point_t code_point;
ecma_char_t code_unit;

if (!(lit_read_code_point_from_hex (current_p + 1, 4, &code_point)))
if (!(lit_read_code_unit_from_hex (current_p + 1, 4, &code_unit)))
{
return;
}

current_p += 5;
write_p += lit_code_point_to_cesu8 (code_point, write_p);
write_p += lit_code_unit_to_utf8 (code_unit, write_p);
continue;
}
default:
Expand Down
25 changes: 13 additions & 12 deletions jerry-core/lit/lit-char-helpers.c
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
/* Copyright 2015 Samsung Electronics Co., Ltd.
/* Copyright 2015-2016 Samsung Electronics Co., Ltd.
* Copyright 2016 University of Szeged.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -283,38 +284,38 @@ lit_char_hex_to_int (ecma_char_t c) /**< code unit, corresponding to

/**
* Parse the next number_of_characters hexadecimal character,
* and construct a code point from them. The buffer must
* and construct a code unit from them. The buffer must
* be zero terminated.
*
* @return true if decoding was successful, false otherwise
*/
bool
lit_read_code_point_from_hex (lit_utf8_byte_t *buf_p, /**< buffer with characters */
lit_utf8_size_t number_of_characters, /**< number of characters to be read */
lit_code_point_t *out_code_point_p) /**< [out] decoded result */
lit_read_code_unit_from_hex (lit_utf8_byte_t *buf_p, /**< buffer with characters */
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The doc still refers to code points.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With the help of the non-character markers (as in my previous comment), couldn't we just return a code unit from this function and compare it to U+FFFF to see whether we succeeded or not? (I'm really against out parameters given with pointers, so prefer to replace them wherever possible.)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No. The helper must be able to read "FFFF" and should not have to decide whether it is a valid code unit or not. Currently it might work on the test suite, but this change would be dangerous.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just swept through the call sites of this function, and it seems that only three functions call it with number_of_characters being 4: ecma_builtin_json_parse_string, re_parse_char_class, and re_parse_next_token. (Where the param is 2, there is no possible way of getting a valid value of 0xFFFF.) Also, in valid JSON, there must not be a U+FFFF according to the Unicode specs. Where I'm not completely sure is the regexes. :/ While googling around I've seen some artificial examples with regex character classes where the upper limit was \uFFFF. What I couldn't find out with certainty is whether they are valid examples.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

JS supports all characters between 0 and 0xFFFF. JS was not designed to be UTF-compatible; every character is a two-byte number.

lit_utf8_size_t number_of_characters, /**< number of characters to be read */
ecma_char_ptr_t out_code_unit_p) /**< [out] decoded result */
{
lit_code_point_t code_point = 0;
ecma_char_t code_unit = LIT_CHAR_NULL;

JERRY_ASSERT (number_of_characters >= 2 && number_of_characters <= 4);

for (lit_utf8_size_t i = 0; i < number_of_characters; i++)
{
code_point <<= 4;
code_unit = (ecma_char_t) (code_unit << 4u);

if (*buf_p >= LIT_CHAR_ASCII_DIGITS_BEGIN
&& *buf_p <= LIT_CHAR_ASCII_DIGITS_END)
{
code_point |= (uint32_t) (*buf_p - LIT_CHAR_ASCII_DIGITS_BEGIN);
code_unit |= (ecma_char_t) (*buf_p - LIT_CHAR_ASCII_DIGITS_BEGIN);
}
else if (*buf_p >= LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_BEGIN
&& *buf_p <= LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_END)
{
code_point |= (uint32_t) (*buf_p - (LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_BEGIN - 10));
code_unit |= (ecma_char_t) (*buf_p - (LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_BEGIN - 10));
}
else if (*buf_p >= LIT_CHAR_ASCII_UPPERCASE_LETTERS_HEX_BEGIN
&& *buf_p <= LIT_CHAR_ASCII_UPPERCASE_LETTERS_HEX_END)
{
code_point |= (uint32_t) (*buf_p - (LIT_CHAR_ASCII_UPPERCASE_LETTERS_HEX_BEGIN - 10));
code_unit |= (ecma_char_t) (*buf_p - (LIT_CHAR_ASCII_UPPERCASE_LETTERS_HEX_BEGIN - 10));
}
else
{
Expand All @@ -324,9 +325,9 @@ lit_read_code_point_from_hex (lit_utf8_byte_t *buf_p, /**< buffer with character
buf_p++;
}

*out_code_point_p = code_point;
*out_code_unit_p = code_unit;
return true;
} /* lit_read_code_point_from_hex */
} /* lit_read_code_unit_from_hex */

/**
* Check if specified character is a word character (part of IsWordChar abstract operation)
Expand Down
5 changes: 4 additions & 1 deletion jerry-core/lit/lit-char-helpers.h
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
/* Copyright 2015-2016 Samsung Electronics Co., Ltd.
* Copyright 2016 University of Szeged.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -18,6 +19,8 @@

#include "lit-globals.h"

#define LIT_CHAR_UNDEF ((ecma_char_t) 0xFFFF) /* undefined character */

/*
* Format control characters (ECMA-262 v5, Table 1)
*/
Expand Down Expand Up @@ -213,7 +216,7 @@ extern bool lit_char_is_hex_digit (ecma_char_t);
extern uint32_t lit_char_hex_to_int (ecma_char_t);

/* read a hex encoded code unit from a zero terminated buffer */
bool lit_read_code_point_from_hex (lit_utf8_byte_t *, lit_utf8_size_t, lit_code_point_t *);
bool lit_read_code_unit_from_hex (lit_utf8_byte_t *, lit_utf8_size_t, ecma_char_ptr_t);

/**
* Null character
Expand Down
8 changes: 4 additions & 4 deletions jerry-core/parser/regexp/re-compiler.c
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,12 @@
*/
static void
re_append_char_class (void *re_ctx_p, /**< RegExp compiler context */
uint32_t start, /**< character class range from */
uint32_t end) /**< character class range to */
ecma_char_t start, /**< character class range from */
ecma_char_t end) /**< character class range to */
{
re_compiler_ctx_t *ctx_p = (re_compiler_ctx_t *) re_ctx_p;
re_append_char (ctx_p->bytecode_ctx_p, (ecma_char_t) start);
re_append_char (ctx_p->bytecode_ctx_p, (ecma_char_t) end);
re_append_char (ctx_p->bytecode_ctx_p, start);
re_append_char (ctx_p->bytecode_ctx_p, end);
ctx_p->parser_ctx_p->num_of_classes++;
} /* re_append_char_class */

Expand Down
Loading