Skip to content

Commit

Permalink
refactor(lexer): tighten safety of lexer by always including lifetime…
Browse files Browse the repository at this point in the history
… on `SourcePosition` (#8293)

Always use `SourcePosition<'a>` not `SourcePosition`, to ensure not reading from a `Source` with a `SourcePosition` which is from *another* `Source`. This doesn't definitively guard against UB - that is the job of `UniquePromise` - but it adds another level of defence.
  • Loading branch information
overlookmotel committed Jan 7, 2025
1 parent 16dcdaf commit 64bfdfe
Show file tree
Hide file tree
Showing 4 changed files with 23 additions and 14 deletions.
4 changes: 2 additions & 2 deletions crates/oxc_parser/src/lexer/comment.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ static LINE_BREAK_TABLE: SafeByteMatchTable =
static MULTILINE_COMMENT_START_TABLE: SafeByteMatchTable =
safe_byte_match_table!(|b| matches!(b, b'*' | b'\r' | b'\n' | LS_OR_PS_FIRST));

impl Lexer<'_> {
impl<'a> Lexer<'a> {
/// Section 12.4 Single Line Comment
pub(super) fn skip_single_line_comment(&mut self) -> Kind {
byte_search! {
Expand Down Expand Up @@ -151,7 +151,7 @@ impl Lexer<'_> {
Kind::Skip
}

fn skip_multi_line_comment_after_line_break(&mut self, pos: SourcePosition) -> Kind {
fn skip_multi_line_comment_after_line_break(&mut self, pos: SourcePosition<'a>) -> Kind {
// Can use `memchr` here as only searching for 1 pattern.
// Cache `Finder` instance on `Lexer` as there's a significant cost to creating it.
// `Finder::new` isn't a const function, so can't make it a `static`, and `lazy_static!`
Expand Down
9 changes: 6 additions & 3 deletions crates/oxc_parser/src/lexer/identifier.rs
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ impl<'a> Lexer<'a> {
/// Handle rest of identifier after first byte of a multi-byte Unicode char found.
/// Any number of characters can have already been consumed from `self.source` prior to it.
/// `self.source` should be positioned at start of Unicode character.
fn identifier_tail_unicode(&mut self, start_pos: SourcePosition) -> &'a str {
fn identifier_tail_unicode(&mut self, start_pos: SourcePosition<'a>) -> &'a str {
let c = self.peek_char().unwrap();
if is_identifier_part_unicode(c) {
self.consume_char();
Expand All @@ -113,7 +113,10 @@ impl<'a> Lexer<'a> {
///
/// First char should have been consumed from `self.source` prior to calling this.
/// `start_pos` should be position of the start of the identifier (before first char was consumed).
pub(super) fn identifier_tail_after_unicode(&mut self, start_pos: SourcePosition) -> &'a str {
pub(super) fn identifier_tail_after_unicode(
&mut self,
start_pos: SourcePosition<'a>,
) -> &'a str {
// Identifier contains a Unicode chars, so probably contains more.
// So just iterate over chars now, instead of bytes.
while let Some(c) = self.peek_char() {
Expand Down Expand Up @@ -146,7 +149,7 @@ impl<'a> Lexer<'a> {
///
/// The `\` must not have be consumed from `lexer.source`.
/// `start_pos` must be position of start of identifier.
fn identifier_backslash(&mut self, start_pos: SourcePosition, is_start: bool) -> &'a str {
fn identifier_backslash(&mut self, start_pos: SourcePosition<'a>, is_start: bool) -> &'a str {
// Create arena string to hold unescaped identifier.
// We don't know how long identifier will end up being. Take a guess that total length
// will be double what we've seen so far, or `MIN_ESCAPED_STR_LEN` minimum.
Expand Down
22 changes: 14 additions & 8 deletions crates/oxc_parser/src/lexer/source.rs
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ impl<'a> Source<'a> {

/// Move current position.
#[inline]
pub(super) fn set_position(&mut self, pos: SourcePosition) {
pub(super) fn set_position(&mut self, pos: SourcePosition<'a>) {
// `SourcePosition` always upholds the invariants of `Source`, as long as it's created
// from this `Source`. `SourcePosition`s can only be created from a `Source`.
// `Source::new` takes a `UniquePromise`, which guarantees that it's the only `Source`
Expand Down Expand Up @@ -216,7 +216,7 @@ impl<'a> Source<'a> {
}

/// Get string slice from a `SourcePosition` up to the current position of `Source`.
pub(super) fn str_from_pos_to_current(&self, pos: SourcePosition) -> &'a str {
pub(super) fn str_from_pos_to_current(&self, pos: SourcePosition<'a>) -> &'a str {
assert!(pos.ptr <= self.ptr);
// SAFETY: The above assertion satisfies `str_from_pos_to_current_unchecked`'s requirements
unsafe { self.str_from_pos_to_current_unchecked(pos) }
Expand All @@ -230,7 +230,10 @@ impl<'a> Source<'a> {
/// 1. `Source::set_position` has not been called since `pos` was created.
/// 2. `pos` has not been advanced with `SourcePosition::add`.
#[inline]
pub(super) unsafe fn str_from_pos_to_current_unchecked(&self, pos: SourcePosition) -> &'a str {
pub(super) unsafe fn str_from_pos_to_current_unchecked(
&self,
pos: SourcePosition<'a>,
) -> &'a str {
// SAFETY: Caller guarantees `pos` is not after current position of `Source`.
// `self.ptr` is always a valid `SourcePosition` due to invariants of `Source`.
self.str_between_positions_unchecked(pos, SourcePosition::new(self.ptr))
Expand All @@ -244,15 +247,18 @@ impl<'a> Source<'a> {
/// 1. `Source::set_position` has not been called since `pos` was created.
/// 2. `pos` has not been moved backwards with `SourcePosition::sub`.
#[inline]
pub(super) unsafe fn str_from_current_to_pos_unchecked(&self, pos: SourcePosition) -> &'a str {
pub(super) unsafe fn str_from_current_to_pos_unchecked(
&self,
pos: SourcePosition<'a>,
) -> &'a str {
// SAFETY: Caller guarantees `pos` is not before current position of `Source`.
// `self.ptr` is always a valid `SourcePosition` due to invariants of `Source`.
self.str_between_positions_unchecked(SourcePosition::new(self.ptr), pos)
}

/// Get string slice from a `SourcePosition` up to the end of `Source`.
#[inline]
pub(super) fn str_from_pos_to_end(&self, pos: SourcePosition) -> &'a str {
pub(super) fn str_from_pos_to_end(&self, pos: SourcePosition<'a>) -> &'a str {
// SAFETY: Invariants of `SourcePosition` is that it cannot be after end of `Source`,
// and always on a UTF-8 character boundary.
// `self.end` is always a valid `SourcePosition` due to invariants of `Source`.
Expand All @@ -266,8 +272,8 @@ impl<'a> Source<'a> {
#[inline]
pub(super) unsafe fn str_between_positions_unchecked(
&self,
start: SourcePosition,
end: SourcePosition,
start: SourcePosition<'a>,
end: SourcePosition<'a>,
) -> &'a str {
// Check `start` is not after `end`
debug_assert!(start.ptr <= end.ptr);
Expand Down Expand Up @@ -304,7 +310,7 @@ impl<'a> Source<'a> {
/// Get offset of `pos`.
#[expect(clippy::cast_possible_truncation)]
#[inline]
pub(super) fn offset_of(&self, pos: SourcePosition) -> u32 {
pub(super) fn offset_of(&self, pos: SourcePosition<'a>) -> u32 {
// Cannot overflow `u32` because of `MAX_LEN` check in `Source::new`
(pos.addr() - self.start as usize) as u32
}
Expand Down
2 changes: 1 addition & 1 deletion crates/oxc_parser/src/lexer/template.rs
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ impl<'a> Lexer<'a> {
/// # SAFETY
/// `pos` must not be before `self.source.position()`
#[expect(clippy::unnecessary_safety_comment)]
unsafe fn template_literal_create_string(&self, pos: SourcePosition) -> String<'a> {
unsafe fn template_literal_create_string(&self, pos: SourcePosition<'a>) -> String<'a> {
// Create arena string to hold modified template literal.
// We don't know how long template literal will end up being. Take a guess that total length
// will be double what we've seen so far, or `MIN_ESCAPED_TEMPLATE_LIT_LEN` minimum.
Expand Down

0 comments on commit 64bfdfe

Please sign in to comment.