// encoding_parser.dart

#library('encoding_parser');

#import('constants.dart');
#import('inputstream.dart');
#import('utils.dart');

// TODO(jmesserly): I converted StopIteration to NoMoreElementsException. Seems
// strange to throw this from outside of an iterator though.
/**
 * String-like object with an associated position and various extra methods.
 *
 * The position starts just before the first character (stored as -1). Once
 * the stored position is at or past the string length, the position
 * accessors and the methods built on them throw [NoMoreElementsException].
 */
class EncodingBytes implements Iterable<String> {
  final String _bytes;
  int _position;

  EncodingBytes(String bytes) : _bytes = bytes, _position = -1;

  /**
   * Iterates over the characters of the underlying string. This walks a
   * separate list of characters; it neither reads nor advances the position.
   */
  Iterator<String> iterator() => _bytes.splitChars().iterator();

  /** Length of the underlying string. */
  int get length() => _bytes.length;

  /**
   * Advances the position by one and returns the character at the new
   * position. Throws [NoMoreElementsException] if the new position is past
   * the last character.
   */
  String next() {
    var p = _position = _position + 1;
    if (p >= length) {
      throw const NoMoreElementsException();
    } else if (p < 0) {
      throw new IndexOutOfRangeException(p);
    }
    return _bytes[p];
  }

  /**
   * Moves the position back by one and returns the character at the new
   * position. The range checks are made against the position before it is
   * decremented.
   */
  String previous() {
    var p = _position;
    if (p >= length) {
      throw const NoMoreElementsException();
    } else if (p < 0) {
      throw new IndexOutOfRangeException(p);
    }
    // NOTE(review): when the position is 0 this indexes the string at -1;
    // confirm whether callers can reach that case.
    _position = p = p - 1;
    return _bytes[p];
  }

  /**
   * Sets the position to [value]. Throws [NoMoreElementsException] if the
   * position held *before* the assignment is already at or past the end;
   * [value] itself is not validated.
   */
  set position(int value) {
    if (_position >= length) {
      throw const NoMoreElementsException();
    }
    _position = value;
  }

  /**
   * The current position, reported as 0 while the stored position is still
   * negative. Throws [NoMoreElementsException] once the stored position is at
   * or past the end.
   */
  int get position() {
    if (_position >= length) {
      throw const NoMoreElementsException();
    }
    if (_position >= 0) {
      return _position;
    } else {
      return 0;
    }
  }

  /** The character at the current [position]. */
  String get currentByte() => _bytes[position];

  /**
   * Skip past a list of characters. Defaults to skipping [isWhitespace].
   *
   * Returns the first character for which [skipChars] is false and leaves
   * the position on it. If every remaining character is skipped, the position
   * is moved to the end and null is returned.
   */
  String skip([CharPreciate skipChars]) {
    if (skipChars == null) skipChars = isWhitespace;
    var p = position;  // use property for the error-checking
    while (p < length) {
      var c = _bytes[p];
      if (!skipChars(c)) {
        _position = p;
        return c;
      }
      p += 1;
    }
    _position = p;
    return null;
  }

  /**
   * Advances to the first character for which [untilChars] is true, leaves
   * the position on it and returns it. If no such character remains, the
   * position is moved to the end and null is returned.
   */
  String skipUntil(CharPreciate untilChars) {
    var p = position;  // use property for the error-checking
    while (p < length) {
      var c = _bytes[p];
      if (untilChars(c)) {
        _position = p;
        return c;
      }
      p += 1;
    }
    // Record that the input was exhausted, exactly as skip() does. Without
    // this the position silently stays where the scan started, and a caller
    // that reads `position` afterwards cannot tell that nothing was found.
    _position = p;
    return null;
  }

  /**
   * Look for a sequence of bytes at the start of a string. If the bytes
   * are found return true and advance the position to the byte after the
   * match. Otherwise return false and leave the position alone.
   */
  bool matchBytes(String bytes) {
    var p = position;
    if (_bytes.length < p + bytes.length) {
      return false;
    }
    var data = _bytes.substring(p, p + bytes.length);
    if (data == bytes) {
      position += bytes.length;
      return true;
    }
    return false;
  }

  /**
   * Look for the next sequence of bytes matching a given sequence, searching
   * from the current position. If a match is found, advance the position to
   * the last byte of the match and return true. If there is no match, throw
   * [NoMoreElementsException].
   */
  bool jumpTo(String bytes) {
    var newPosition = _bytes.indexOf(bytes, position);
    if (newPosition >= 0) {
      // indexOf reports an index into the whole string, not an offset from
      // the search start, so it is assigned directly instead of being added
      // to the current position.
      _position = newPosition + bytes.length - 1;
      return true;
    } else {
      throw const NoMoreElementsException();
    }
  }

  /**
   * Returns the characters from [start] (inclusive) to [end] (exclusive).
   * [end] defaults to the string length; a negative [end] counts back from
   * the end of the string.
   */
  String slice(int start, [int end]) {
    if (end == null) end = length;
    if (end < 0) end += length;
    // substring takes an end index, not a length.
    return _bytes.substring(start, end);
  }
}

/**
 * Mini parser for detecting character encoding from meta elements.
 *
 * Scans the input for markup, looking for a meta element whose charset or
 * content attribute names a codec that [codecName] recognizes.
 */
class EncodingParser {
  final EncodingBytes data;
  String encoding;

  /** [bytes] - the data to work on for encoding detection. */
  EncodingParser(List<int> bytes)
      // Note: this is intentionally interpreting bytes as codepoints.
      : data = new EncodingBytes(new String.fromCharCodes(bytes).toLowerCase());

  /**
   * Runs the scan and returns the detected encoding, or null if the input
   * ends (or a handler runs out of input) before one is found.
   */
  String getEncoding() {
    // Order matters: longer prefixes must be tried before the shorter
    // prefixes they start with ("<!--" before "<!", everything before "<").
    final methodDispatch = [
      ["<!--", handleComment],
      ["<meta", handleMeta],
      ["</", handlePossibleEndTag],
      ["<!", handleOther],
      ["<?", handleOther],
      ["<", handlePossibleStartTag]];

    try {
      while (true) {
        // Step the shared position forward one character per iteration.
        // Iterating `data` directly would not do this, because its iterator
        // is independent of the position and the scan would stall on the
        // first character that matches no prefix. next() throws
        // NoMoreElementsException at the end of the input, which ends the
        // loop through the outer catch.
        data.next();
        var keepParsing = true;
        for (var dispatch in methodDispatch) {
          if (data.matchBytes(dispatch[0])) {
            try {
              keepParsing = dispatch[1]();
              break;
            } catch (NoMoreElementsException e) {
              // The handler ran off the end of the input; stop scanning.
              keepParsing = false;
              break;
            }
          }
        }
        if (!keepParsing) {
          break;
        }
      }
    } catch (NoMoreElementsException e) {
      // Catch this here to match behavior of Python's StopIteration
    }
    return encoding;
  }

  /** Skip over comments. */
  bool handleComment() => data.jumpTo("-->");

  /**
   * Reads the attributes of a meta element. Returns false, with [encoding]
   * set, as soon as a charset or content attribute yields a codec; returns
   * true to keep scanning otherwise.
   */
  bool handleMeta() {
    if (!isWhitespace(data.currentByte)) {
      // "<meta" not followed by whitespace is some other tag; keep going.
      return true;
    }
    // We have a valid meta element we want to search for attributes
    while (true) {
      // Try to find the next attribute after the current position
      var attr = getAttribute();
      if (attr === null) return true;

      if (attr[0] == "charset") {
        var tentativeEncoding = attr[1];
        var codec = codecName(tentativeEncoding);
        if (codec != null) {
          encoding = codec;
          return false;
        }
      } else if (attr[0] == "content") {
        var contentParser = new ContentAttrParser(new EncodingBytes(attr[1]));
        var tentativeEncoding = contentParser.parse();
        var codec = codecName(tentativeEncoding);
        if (codec != null) {
          encoding = codec;
          return false;
        }
      }
    }
  }

  /** Handles input following "<". */
  bool handlePossibleStartTag() => handlePossibleTag(false);

  /** Handles input following "</". */
  bool handlePossibleEndTag() {
    data.next();
    return handlePossibleTag(true);
  }

  /**
   * Skips over a start or end tag, consuming its attributes. Always returns
   * true so that scanning continues.
   */
  bool handlePossibleTag(bool endTag) {
    if (!isLetter(data.currentByte)) {
      // If the next byte is not an ascii letter either ignore this
      // fragment (possible start tag case) or treat it according to
      // handleOther
      if (endTag) {
        data.previous();
        handleOther();
      }
      return true;
    }

    var c = data.skipUntil(isSpaceOrAngleBracket);
    if (c == "<") {
      // return to the first step in the overall "two step" algorithm
      // reprocessing the < byte
      data.previous();
    } else {
      // Read all attributes
      var attr = getAttribute();
      while (attr != null) {
        attr = getAttribute();
      }
    }
    return true;
  }

  /** Skips to the next ">". */
  bool handleOther() => data.jumpTo(">");

  /**
   * Return a name,value pair for the next attribute in the stream,
   * if one is found, or null.
   *
   * Letters in both the name and the value are lower-cased. An attribute
   * without a value is returned with an empty string as its value.
   */
  List<String> getAttribute() {
    // Step 1 (skip chars)
    var c = data.skip((x) => x == "/" || isWhitespace(x));
    // Step 2
    if (c == ">" || c == null) {
      return null;
    }
    // Step 3
    var attrName = [];
    var attrValue = [];
    // Step 4 attribute name
    while (true) {
      if (c == null) {
        return null;
      } else if (c == "=" && attrName.length > 0) {
        break;
      } else if (isWhitespace(c)) {
        // Step 6: skip whitespace after the name. skip() already leaves c on
        // the first non-space character, which step 7 must inspect, so the
        // position is not advanced any further here.
        c = data.skip();
        break;
      } else if (c == "/" || c == ">") {
        return [joinStr(attrName), ""];
      } else if (isLetter(c)) {
        attrName.add(c.toLowerCase());
      } else {
        attrName.add(c);
      }
      // Step 5
      c = data.next();
    }
    // Step 7
    if (c != "=") {
      // No value: step back so the byte just read is processed again.
      data.previous();
      return [joinStr(attrName), ""];
    }
    // Step 8
    data.next();
    // Step 9
    c = data.skip();
    // Step 10
    if (c == "'" || c == '"') {
      // 10.1
      var quoteChar = c;
      while (true) {
        // 10.2
        c = data.next();
        if (c == quoteChar) {
          // 10.3
          data.next();
          return [joinStr(attrName), joinStr(attrValue)];
        } else if (isLetter(c)) {
          // 10.4
          attrValue.add(c.toLowerCase());
        } else {
          // 10.5
          attrValue.add(c);
        }
      }
    } else if (c == ">") {
      return [joinStr(attrName), ""];
    } else if (c === null) {
      return null;
    } else if (isLetter(c)) {
      attrValue.add(c.toLowerCase());
    } else {
      attrValue.add(c);
    }
    // Step 11
    while (true) {
      c = data.next();
      if (isSpaceOrAngleBracket(c)) {
        return [joinStr(attrName), joinStr(attrValue)];
      } else if (c === null) {
        return null;
      } else if (isLetter(c)) {
        attrValue.add(c.toLowerCase());
      } else {
        attrValue.add(c);
      }
    }
  }
}


/**
 * Pulls the charset value out of the text wrapped by [data], by locating
 * "charset", an "=" sign and then a quoted or unquoted value.
 */
class ContentAttrParser {
  final EncodingBytes data;

  ContentAttrParser(this.data);

  /**
   * Returns the text that follows "charset=" in [data]. Returns null when
   * "charset" is not present, when it is not followed by "=", or when the
   * input runs out before a value is reached.
   */
  String parse() {
    try {
      // Locate the attribute name; jumpTo throws when it is absent.
      data.jumpTo("charset");
      data.position += 1;
      data.skip();

      // Without an = sign this is not a charset declaration.
      if (data.currentByte != "=") return null;

      data.position += 1;
      data.skip();

      var opener = data.currentByte;
      if (opener == '"' || opener == "'") {
        // Quoted value: take everything up to the matching quote mark.
        data.position += 1;
        var valueStart = data.position;
        return data.jumpTo(opener)
            ? data.slice(valueStart, data.position)
            : null;
      }

      // Unquoted value: runs up to the next whitespace character.
      var valueStart = data.position;
      try {
        data.skipUntil(isWhitespace);
        return data.slice(valueStart, data.position);
      } catch (NoMoreElementsException e) {
        // No whitespace follows, so the value is the rest of the input.
        return data.slice(valueStart);
      }
    } catch (NoMoreElementsException e) {
      return null;
    }
  }
}


/** Whether [char] is ">", "<" or a character accepted by [isWhitespace]. */
bool isSpaceOrAngleBracket(String char) {
  if (char == ">" || char == "<") return true;
  return isWhitespace(char);
}

/**
 * Signature of a test applied to one character at a time, as taken by
 * [EncodingBytes.skip] and [EncodingBytes.skipUntil].
 */
typedef bool CharPreciate(String char);