first commit

2023-08-01 13:49:46 +02:00
commit 1fc239fd54
20238 changed files with 3112246 additions and 0 deletions
@@ -0,0 +1,80 @@
+'use strict';
+
+//NOTE: mixes location tracking into the given tokenizer instance. All overrides are installed on the
+//instance itself and delegate to the implementation found on its prototype. Tokens built by the patched
+//factories receive a `location` object of the form { start, end }.
+exports.assign = function (tokenizer) {
+    //NOTE: the prototype is taken from the instance, so this module doesn't have to require
+    //the Tokenizer module (that would create a circular reference).
+    var proto = Object.getPrototypeOf(tokenizer);
+
+    //NOTE: position at which the token being built has started. Stays -1 until one of the
+    //patched mode states (see the bottom of this function) has been entered.
+    tokenizer.tokenStartLoc = -1;
+
+    //NOTE: location info builder. The end position is left at -1 here and is filled in
+    //by the emission overrides below.
+    tokenizer._attachLocationInfo = function (token) {
+        var location = {
+            start: this.tokenStartLoc,
+            end: -1
+        };
+
+        token.location = location;
+    };
+
+    //NOTE: token creation overrides. Each one lets the prototype build the token first and
+    //then attaches a location object to the freshly created token.
+    tokenizer._createStartTagToken = function (firstCh) {
+        proto._createStartTagToken.call(this, firstCh);
+        this._attachLocationInfo(this.currentToken);
+    };
+
+    tokenizer._createEndTagToken = function (firstCh) {
+        proto._createEndTagToken.call(this, firstCh);
+        this._attachLocationInfo(this.currentToken);
+    };
+
+    tokenizer._createCommentToken = function () {
+        proto._createCommentToken.call(this);
+        this._attachLocationInfo(this.currentToken);
+    };
+
+    tokenizer._createDoctypeToken = function (firstCh) {
+        proto._createDoctypeToken.call(this, firstCh);
+        this._attachLocationInfo(this.currentToken);
+    };
+
+    //NOTE: character tokens are kept in `currentCharacterToken`, not in `currentToken`.
+    tokenizer._createCharacterToken = function (type, ch) {
+        proto._createCharacterToken.call(this, type, ch);
+        this._attachLocationInfo(this.currentCharacterToken);
+    };
+
+    //NOTE: token emission overrides. They determine the end location of the emitted token.
+    tokenizer._emitCurrentToken = function () {
+        var pendingChars = this.currentCharacterToken,
+            tokenLocation = this.currentToken.location;
+
+        //NOTE: a pending character token ends exactly where the current token starts.
+        if (pendingChars)
+            pendingChars.location.end = tokenLocation.start;
+
+        tokenLocation.end = this.preprocessor.pos + 1;
+        proto._emitCurrentToken.call(this);
+    };
+
+    tokenizer._emitCurrentCharacterToken = function () {
+        var pendingChars = this.currentCharacterToken;
+
+        //NOTE: an end of -1 means that _emitCurrentToken() has not closed this character token.
+        //In that case the current preprocessor position is used as is (without + 1): emission is
+        //forced by the start of the next character token here, so the position is already advanced.
+        if (pendingChars && pendingChars.location.end === -1)
+            pendingChars.location.end = this.preprocessor.pos;
+
+        proto._emitCurrentCharacterToken.call(this);
+    };
+
+    //NOTE: wrap the initial state of each mode, so the token start position is captured
+    //before the original state handler runs.
+    Object.keys(proto.MODE).forEach(function (modeName) {
+        var state = proto.MODE[modeName];
+
+        tokenizer[state] = function (cp) {
+            this.tokenStartLoc = this.preprocessor.pos;
+            proto[state].call(this, cp);
+        };
+    });
+};
@@ -0,0 +1,115 @@
+'use strict';
+
+var UNICODE = require('../common/unicode');
+
+//Aliases
+var $ = UNICODE.CODE_POINTS;
+
+//Utils
+
+//OPTIMIZATION: these utility functions should not be moved out of this module. V8 Crankshaft will not inline
+//these functions if they are located in another module, because of the context switch.
+//Always perform an inlining check before modifying these functions ('node --trace-inlining').
+//NOTE: a code point is treated as reserved if it lies in the surrogate range (0xD800-0xDFFF)
+//or if it is greater than 0x10FFFF.
+function isReservedCodePoint(cp) {
+    var isSurrogate = cp >= 0xD800 && cp <= 0xDFFF;
+
+    return isSurrogate || cp > 0x10FFFF;
+}
+
+//NOTE: true if cp1 is a lead surrogate (0xD800-0xDBFF) and cp2 is a trail surrogate (0xDC00-0xDFFF).
+function isSurrogatePair(cp1, cp2) {
+    var isLead = cp1 >= 0xD800 && cp1 <= 0xDBFF,
+        isTrail = cp2 >= 0xDC00 && cp2 <= 0xDFFF;
+
+    return isLead && isTrail;
+}
+
+//NOTE: combines a lead (cp1) and a trail (cp2) surrogate into a single code point.
+//0x2400 is 0x10000 - 0xDC00, i.e. the supplementary plane offset and the trail surrogate
+//base folded into one constant.
+function getSurrogatePairCodePoint(cp1, cp2) {
+    var leadBits = (cp1 - 0xD800) * 0x400;
+
+    return leadBits + 0x2400 + cp2;
+}
+
+//Preprocessor
+//NOTE: HTML input preprocessing
+//(see: http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#preprocessing-the-input-stream)
+var Preprocessor = module.exports = function (html) {
+    //NOTE: write() stores the input in `this.html` and sets `this.lastCharPos`.
+    this.write(html);
+
+    //NOTE: one leading U+FEFF BYTE ORDER MARK character must be ignored if any are present in the input stream.
+    //Starting at 0 instead of -1 makes the first advance step over it.
+    var startsWithBom = this.html.charCodeAt(0) === $.BOM;
+
+    this.pos = startsWithBom ? 0 : -1;
+
+    //NOTE: bookkeeping for positions that retreat() has to step over (see _addGap()).
+    this.gapStack = [];
+    this.lastGapPos = -1;
+
+    //NOTE: set after a CR has been read, so that a directly following LF can be dropped.
+    this.skipNextNewLine = false;
+};
+
+//NOTE: stores the given html as the input. If there already is (non-empty) input, the new chunk is
+//inserted right after the current position, i.e. in front of the part that hasn't been consumed yet.
+Preprocessor.prototype.write = function (html) {
+    var current = this.html;
+
+    if (current) {
+        var insertionPos = this.pos + 1,
+            consumed = current.substring(0, insertionPos),
+            pending = current.substring(insertionPos, current.length);
+
+        this.html = consumed + html + pending;
+    }
+    else
+        this.html = html;
+
+    //NOTE: index of the last character. advanceAndPeekCodePoint() reports EOF beyond it.
+    this.lastCharPos = this.html.length - 1;
+};
+
+//NOTE: moves to the next position and returns the code point found there ($.EOF past the end of input).
+//Line breaks are normalized on the way: CR is reported as LF and an LF that directly follows a CR is dropped.
+Preprocessor.prototype.advanceAndPeekCodePoint = function () {
+    var pos = ++this.pos;
+
+    if (pos > this.lastCharPos)
+        return $.EOF;
+
+    var cp = this.html.charCodeAt(pos);
+
+    //NOTE: any U+000A LINE FEED (LF) characters that immediately follow a U+000D CARRIAGE RETURN (CR) character
+    //must be ignored. The skipped position is recorded as a gap, so retreat() can step over it.
+    if (cp === $.LINE_FEED && this.skipNextNewLine) {
+        this.skipNextNewLine = false;
+        this._addGap();
+
+        return this.advanceAndPeekCodePoint();
+    }
+
+    //NOTE: all U+000D CARRIAGE RETURN (CR) characters must be converted to U+000A LINE FEED (LF) characters.
+    var isCarriageReturn = cp === $.CARRIAGE_RETURN;
+
+    this.skipNextNewLine = isCarriageReturn;
+
+    if (isCarriageReturn)
+        return $.LINE_FEED;
+
+    //OPTIMIZATION: code units below 0xD800 cover the most common HTML input (e.g. ASCII codes) and are
+    //returned as is. Only the high range pays for the surrogate pair / reserved code point checks.
+    if (cp >= 0xD800)
+        return this._processHighRangeCodePoint(cp);
+
+    return cp;
+};
+
+//NOTE: handles a code unit >= 0xD800 found at the current position. A surrogate pair is merged into
+//a single code point, reserved code points are replaced with $.REPLACEMENT_CHARACTER.
+Preprocessor.prototype._processHighRangeCodePoint = function (cp) {
+    var codePoint = cp;
+
+    //NOTE: a pair is possible only if there is at least one more character in the input.
+    if (this.pos !== this.lastCharPos) {
+        var trailing = this.html.charCodeAt(this.pos + 1);
+
+        if (isSurrogatePair(codePoint, trailing)) {
+            codePoint = getSurrogatePairCodePoint(codePoint, trailing);
+
+            //NOTE: consume the second half of the pair and mark its position as a gap
+            //that should be avoided during retreat.
+            this.pos++;
+            this._addGap();
+        }
+    }
+
+    return isReservedCodePoint(codePoint) ? $.REPLACEMENT_CHARACTER : codePoint;
+};
+
+//NOTE: marks the current position as a gap. The previous gap position is saved on the stack,
+//so retreat() can restore it after the gap has been passed.
+Preprocessor.prototype._addGap = function () {
+    var previousGapPos = this.lastGapPos;
+
+    this.gapStack.push(previousGapPos);
+    this.lastGapPos = this.pos;
+};
+
+//NOTE: moves one position back. If the current position is the last recorded gap, the gap is
+//stepped over as well (two positions back) and the previous gap position is restored from the stack.
+Preprocessor.prototype.retreat = function () {
+    var step = 1;
+
+    if (this.pos === this.lastGapPos) {
+        this.lastGapPos = this.gapStack.pop();
+        step = 2;
+    }
+
+    this.pos -= step;
+};