first commit

This commit is contained in:
s.golasch
2023-08-01 13:49:46 +02:00
commit 1fc239fd54
20238 changed files with 3112246 additions and 0 deletions

View File

@@ -0,0 +1,107 @@
'use strict';
var Tokenizer = require('../tokenization/tokenizer'),
TokenizerProxy = require('./tokenizer_proxy'),
Utils = require('../common/utils');
//Default options
//Option values used when the caller does not override them.
//The SimpleApiParser constructor passes this object to Utils.mergeOptions together with the user options.
var DEFAULT_OPTIONS = {
//NOTE(review): presumably consumed by the Tokenizer, which receives the merged options - confirm there
decodeHtmlEntities: true,
//When true, every handler receives the location of the current token as an extra trailing argument
locationInfo: false
};
//Skipping handler
//No-op callback: _wrapHandler substitutes it for every handler the caller did not supply,
//so the parser can invoke handlers unconditionally.
function skip() {
//NOTE: do nothing =)
}
//SimpleApiParser
//Creates a parser that reports tokens to the supplied callbacks.
//  handlers - optional object with any of the `doctype`, `startTag`, `endTag`, `text` and `comment`
//             callbacks; each one that is missing is replaced by a no-op in _wrapHandler.
//  options  - optional overrides that are merged over DEFAULT_OPTIONS.
var SimpleApiParser = module.exports = function (handlers, options) {
    //NOTE: every individual handler is optional, so the container is treated as optional too.
    //Without this fallback `new SimpleApiParser()` fails with a TypeError on `handlers.doctype`.
    var userHandlers = handlers || {};

    this.options = Utils.mergeOptions(DEFAULT_OPTIONS, options);

    this.handlers = {
        doctype: this._wrapHandler(userHandlers.doctype),
        startTag: this._wrapHandler(userHandlers.startTag),
        endTag: this._wrapHandler(userHandlers.endTag),
        text: this._wrapHandler(userHandlers.text),
        comment: this._wrapHandler(userHandlers.comment)
    };
};
//Normalizes a user callback: a missing callback becomes the no-op `skip`, and when the
//`locationInfo` option is on the callback is wrapped so that the parser's current token
//location is appended to its arguments. The location is read at call time, not at wrap time.
SimpleApiParser.prototype._wrapHandler = function (handler) {
    var self = this,
        callback = handler || skip;

    //Without location info the callback is used as is
    if (!this.options.locationInfo)
        return callback;

    return function () {
        var callArgs = Array.prototype.slice.call(arguments);

        callArgs.push(self.currentTokenLocation);
        callback.apply(callback, callArgs);
    };
};
//API
//Tokenizes `html` and reports every token to the handlers.
//Consecutive character, whitespace and null-character tokens are merged into a single text
//chunk, which is flushed as soon as any other token (including EOF) arrives.
SimpleApiParser.prototype.parse = function (html) {
    var current = null,
        isTextToken = false;

    this._reset(html);

    do {
        current = this.tokenizerProxy.getNextToken();

        isTextToken = current.type === Tokenizer.CHARACTER_TOKEN ||
                      current.type === Tokenizer.WHITESPACE_CHARACTER_TOKEN ||
                      current.type === Tokenizer.NULL_CHARACTER_TOKEN;

        if (!isTextToken) {
            //Any non-text token terminates the accumulated text run
            this._emitPendingText();
            this._handleToken(current);
            continue;
        }

        if (this.options.locationInfo) {
            //The first token of a run provides the location, later ones only extend its end
            if (this.pendingText === null)
                this.currentTokenLocation = current.location;
            else
                this.currentTokenLocation.end = current.location.end;
        }

        this.pendingText = (this.pendingText || '') + current.chars;
    } while (current.type !== Tokenizer.EOF_TOKEN);
};
//Internals
//Dispatches a single non-text token to the matching handler.
//Token types without a handler (e.g. EOF) only update the current location, if enabled.
SimpleApiParser.prototype._handleToken = function (token) {
    if (this.options.locationInfo)
        this.currentTokenLocation = token.location;

    switch (token.type) {
        case Tokenizer.START_TAG_TOKEN:
            this.handlers.startTag(token.tagName, token.attrs, token.selfClosing);
            break;

        case Tokenizer.END_TAG_TOKEN:
            this.handlers.endTag(token.tagName);
            break;

        case Tokenizer.COMMENT_TOKEN:
            this.handlers.comment(token.data);
            break;

        case Tokenizer.DOCTYPE_TOKEN:
            this.handlers.doctype(token.name, token.publicId, token.systemId);
            break;
    }
};
//Prepares the parser for a new document: creates a fresh tokenizer proxy for `html`
//and clears the text accumulator and the current token location.
SimpleApiParser.prototype._reset = function (html) {
    var proxy = new TokenizerProxy(html, this.options);

    this.tokenizerProxy = proxy;

    //No text has been collected yet and no token has been seen
    this.pendingText = null;
    this.currentTokenLocation = null;
};
//Flushes the accumulated text, if any, to the `text` handler and clears the accumulator.
SimpleApiParser.prototype._emitPendingText = function () {
    var text = this.pendingText;

    //Nothing has been collected since the last flush
    if (text === null)
        return;

    this.handlers.text(text);
    this.pendingText = null;
};

View File

@@ -0,0 +1,122 @@
'use strict';
var Tokenizer = require('../tokenization/tokenizer'),
ForeignContent = require('../common/foreign_content'),
UNICODE = require('../common/unicode'),
HTML = require('../common/html');
//Aliases
//Short names for the tag-name and namespace constant tables exported by ../common/html
var $ = HTML.TAG_NAMES,
NS = HTML.NAMESPACES;
//Tokenizer proxy
//NOTE: this proxy simulates the adjustments that the standard parser applies to the Tokenizer during tree construction.
//Wraps a Tokenizer created for `html` with the given `options` and sets up the
//bookkeeping that tracks which element namespace the token stream is currently in.
var TokenizerProxy = module.exports = function (html, options) {
    var tokenizer = new Tokenizer(html, options);

    this.tokenizer = tokenizer;

    //Namespace tracking starts out empty and outside of foreign content
    this.namespaceStack = [];
    this.namespaceStackTop = -1;
    this.currentNamespace = null;
    this.inForeignContent = false;
};
//API
//Returns the next token of the wrapped tokenizer after updating the namespace bookkeeping.
//Start and end tags are passed to their handlers; a null-character token that appears in
//foreign content is rewritten in place into a character token holding the replacement character.
TokenizerProxy.prototype.getNextToken = function () {
    var nextToken = this.tokenizer.getNextToken();

    switch (nextToken.type) {
        case Tokenizer.START_TAG_TOKEN:
            this._handleStartTagToken(nextToken);
            break;

        case Tokenizer.END_TAG_TOKEN:
            this._handleEndTagToken(nextToken);
            break;

        case Tokenizer.NULL_CHARACTER_TOKEN:
            if (this.inForeignContent) {
                nextToken.type = Tokenizer.CHARACTER_TOKEN;
                nextToken.chars = UNICODE.REPLACEMENT_CHARACTER;
            }
            break;
    }

    return nextToken;
};
//Namespace stack mutations
//Pushes `namespace` onto the namespace stack and makes it the current one.
//Every namespace other than HTML counts as foreign content, and CDATA sections are
//allowed in the tokenizer only while in foreign content.
TokenizerProxy.prototype._enterNamespace = function (namespace) {
    var isForeign = namespace !== NS.HTML;

    this.namespaceStackTop += 1;
    this.namespaceStack.push(namespace);

    this.inForeignContent = isForeign;
    this.currentNamespace = namespace;
    this.tokenizer.allowCDATA = isForeign;
};
//Pops the current namespace and restores the state of the enclosing one.
//When the last namespace is popped the proxy returns to its initial state: no current
//namespace, not in foreign content and CDATA sections disallowed.
TokenizerProxy.prototype._leaveCurrentNamespace = function () {
    //NOTE: nothing to leave - keep the stack pointer from dropping below its initial value
    if (this.namespaceStackTop < 0)
        return;

    this.namespaceStackTop--;
    this.namespaceStack.pop();

    if (this.namespaceStackTop < 0) {
        //NOTE: the stack starts out empty, which stands for plain HTML content. Reading below
        //the bottom of the stack yields `undefined`, and `undefined !== NS.HTML` would leave the
        //proxy stuck in foreign content after the outermost <svg> or <math> element is closed.
        this.currentNamespace = null;
        this.inForeignContent = false;
    }
    else {
        this.currentNamespace = this.namespaceStack[this.namespaceStackTop];
        this.inForeignContent = this.currentNamespace !== NS.HTML;
    }

    this.tokenizer.allowCDATA = this.inForeignContent;
};
//Token handlers
//Switches the tokenizer state for start tags whose content is not tokenized as regular markup.
//Tag names that are not listed leave the tokenizer state untouched.
TokenizerProxy.prototype._ensureTokenizerMode = function (tn) {
    switch (tn) {
        case $.TEXTAREA:
        case $.TITLE:
            this.tokenizer.state = Tokenizer.MODE.RCDATA;
            break;

        case $.PLAINTEXT:
            this.tokenizer.state = Tokenizer.MODE.PLAINTEXT;
            break;

        case $.SCRIPT:
            this.tokenizer.state = Tokenizer.MODE.SCRIPT_DATA;
            break;

        case $.STYLE:
        case $.IFRAME:
        case $.XMP:
        case $.NOEMBED:
        case $.NOFRAMES:
        case $.NOSCRIPT:
            this.tokenizer.state = Tokenizer.MODE.RAWTEXT;
            break;
    }
};
//Updates the namespace bookkeeping and the tokenizer mode for a start tag token:
//  - <svg> and <math> enter their namespace;
//  - in foreign content, a tag that causes an exit leaves the current namespace and an
//    integration point enters the HTML namespace;
//  - in HTML content the tokenizer mode is adjusted for the tag.
//Self-closing tags never enter a namespace.
TokenizerProxy.prototype._handleStartTagToken = function (token) {
    var tn = token.tagName;

    if (tn === $.SVG || tn === $.MATH) {
        //NOTE: a self-closing <svg/> or <math/> is closed as soon as it is opened. It has no
        //content and no end tag, so entering its namespace would never be undone.
        if (!token.selfClosing)
            this._enterNamespace(tn === $.SVG ? NS.SVG : NS.MATHML);
    }

    else if (this.inForeignContent) {
        if (ForeignContent.causesExit(token))
            this._leaveCurrentNamespace();

        //NOTE: the same applies to a self-closing integration point (e.g. <foreignObject/>):
        //the following tokens still belong to the enclosing foreign namespace.
        else if (!token.selfClosing &&
                 (ForeignContent.isMathMLTextIntegrationPoint(tn, this.currentNamespace) ||
                  ForeignContent.isHtmlIntegrationPoint(tn, this.currentNamespace, token.attrs))) {
            this._enterNamespace(NS.HTML);
        }
    }

    else
        this._ensureTokenizerMode(tn);
};
//Updates the namespace bookkeeping and the tokenizer mode for an end tag token.
TokenizerProxy.prototype._handleEndTagToken = function (token) {
    var tagName = token.tagName;

    if (this.inForeignContent) {
        //Only the end tag of the element that opened the current namespace closes it
        if ((tagName === $.SVG && this.currentNamespace === NS.SVG) ||
            (tagName === $.MATH && this.currentNamespace === NS.MATHML))
            this._leaveCurrentNamespace();

        return;
    }

    //NOTE: in HTML content the end tag may close an integration point of the enclosing namespace
    var enclosingNs = this.namespaceStack[this.namespaceStackTop - 1],
        closesIntegrationPoint = ForeignContent.isMathMLTextIntegrationPoint(tagName, enclosingNs) ||
                                 ForeignContent.isHtmlIntegrationPoint(tagName, enclosingNs, token.attrs);

    if (closesIntegrationPoint)
        this._leaveCurrentNamespace();

    else if (tagName === $.SCRIPT)
        this.tokenizer.state = Tokenizer.MODE.DATA;
};