// sanitizer.js

var html4 = require("./lib/html4.js");
var URI = require("./lib/uri.js");

// Copyright (C) 2006 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

/**
 * @fileoverview
 * An HTML sanitizer that can satisfy a variety of security policies.
 *
 * <p>
 * The HTML sanitizer is built around a SAX parser and HTML element and
 * attributes schemas.
 *
 * If the cssparser is loaded, inline styles are sanitized using the
 * css property and value schemas.  Otherwise they are removed during
 * sanitization.
 *
 * If it exists, uses parseCssDeclarations, sanitizeCssProperty,  cssSchema
 *
 * @author mikesamuel@gmail.com
 * @author jasvir@gmail.com
 * \@requires html4, URI
 * \@overrides window
 * \@provides html, html_sanitize
 */

// The Turkish i seems to be a non-issue, but abort in case it is.
// A real Error (rather than a bare string) is thrown so that the failure
// carries a stack trace.
if ('I'.toLowerCase() !== 'i') { throw new Error('I/i problem'); }

/**
 * \@namespace
 */
var html = (function(html4) {

    // Optional CSS sanitizing hooks.  Each one is read from the global
    // window object through a quoted key (for closure compiler) and is left
    // undefined when no window object exists.
    var parseCssDeclarations = ('undefined' !== typeof window)
        ? window['parseCssDeclarations'] : void 0;
    var sanitizeCssProperty = ('undefined' !== typeof window)
        ? window['sanitizeCssProperty'] : void 0;
    var cssSchema = ('undefined' !== typeof window)
        ? window['cssSchema'] : void 0;

    // Seed table mapping named character references to their replacement text.
    // The keys of this object must be 'quoted' or JSCompiler will mangle them!
    // This is a partial list -- lookupEntity() uses the host browser's parser
    // (when available) to implement full entity lookup, and stores every name
    // it resolves that way back into this object.
    // Note that entities are in general case-sensitive; the uppercase ones are
    // explicitly defined by HTML5 (presumably as compatibility).
    var ENTITIES = {
        'lt': '<',
        'LT': '<',
        'gt': '>',
        'GT': '>',
        'amp': '&',
        'AMP': '&',
        'quot': '"',
        'apos': '\'',
        'nbsp': '\u00a0'
    };

    // Patterns for types of entity/character reference names.
    // Decimal numeric reference, e.g. "#10"; group 1 is the digit string.
    var decimalEscapeRe = /^#(\d+)$/;
    // Hexadecimal numeric reference, e.g. "#x0a" or "#X0A"; group 1 is the hex
    // digit string.  Both cases of the 'x' are accepted, matching what
    // ENTITY_RE_1 lets through.
    var hexEscapeRe = /^#[xX]([0-9A-Fa-f]+)$/;
    // contains every entity per http://www.w3.org/TR/2011/WD-html5-20110113/named-character-references.html
    // Only ASCII letters and digits are allowed.  (The range must be spelled
    // A-Z; "A-z" would also admit the punctuation between 'Z' and 'a'.)
    var safeEntityNameRe = /^[A-Za-z][A-Za-z0-9]+$/;
    // Used as a hook to invoke the browser's entity parsing. <textarea> is used
    // because its content is parsed for entities but not tags.
    // TODO(kpreid): This retrieval is a kludge and leads to silent loss of
    // functionality if the document isn't available.
    var entityLookupElement =
        ('undefined' !== typeof window && window['document'])
            ? window['document'].createElement('textarea') : null;
    /**
     * Decodes an HTML entity.
     *
     * Lookup order: the ENTITIES table, then decimal and hexadecimal numeric
     * references, then (only when a browser document is available) the
     * browser's own entity parser, whose answer is cached in ENTITIES.  A name
     * that none of these resolve is returned re-wrapped as '&name;'.
     *
     * Numeric references above U+FFFF are encoded as a UTF-16 surrogate pair
     * rather than being truncated to 16 bits, so e.g. '#65596' (U+1003C) can
     * never decode to '<'.  Values above U+10FFFF decode to U+FFFD.
     *
     * {\@updoc
     * $ lookupEntity('lt')
     * # '<'
     * $ lookupEntity('GT')
     * # '>'
     * $ lookupEntity('amp')
     * # '&'
     * $ lookupEntity('nbsp')
     * # '\xA0'
     * $ lookupEntity('apos')
     * # "'"
     * $ lookupEntity('quot')
     * # '"'
     * $ lookupEntity('#xa')
     * # '\n'
     * $ lookupEntity('#10')
     * # '\n'
     * $ lookupEntity('#x0a')
     * # '\n'
     * $ lookupEntity('#010')
     * # '\n'
     * $ lookupEntity('#x00A')
     * # '\n'
     * $ lookupEntity('#x1F600')
     * # '\uD83D\uDE00'
     * $ lookupEntity('#x110000')
     * # '\uFFFD'
     * $ lookupEntity('Pi')      // Known failure
     * # '\u03A0'
     * $ lookupEntity('pi')      // Known failure
     * # '\u03C0'
     * }
     *
     * @param {string} name the content between the '&' and the ';'.
     * @return {string} a single unicode code-point as a string, or '&name;'
     *     when the name cannot be resolved.
     */
    function lookupEntity(name) {
        // TODO: entity lookup as specified by HTML5 actually depends on the
        // presence of the ";".
        if (ENTITIES.hasOwnProperty(name)) { return ENTITIES[name]; }
        var codePoint;
        var m = name.match(decimalEscapeRe);
        if (m) {
            codePoint = parseInt(m[1], 10);
        } else if (!!(m = name.match(hexEscapeRe))) {
            codePoint = parseInt(m[1], 16);
        }
        if (m) {
            // Outside the Unicode range (including Infinity from a very long
            // digit string): use the replacement character.
            if (codePoint > 0x10FFFF) { return '\uFFFD'; }
            if (codePoint > 0xFFFF) {
                // String.fromCharCode keeps only the low 16 bits of each
                // argument, so build the surrogate pair by hand.
                codePoint -= 0x10000;
                return String.fromCharCode(
                    0xD800 + (codePoint >> 10),
                    0xDC00 + (codePoint & 0x3FF));
            }
            return String.fromCharCode(codePoint);
        }
        if (entityLookupElement && safeEntityNameRe.test(name)) {
            // Let the browser decode the reference, then remember the answer.
            entityLookupElement.innerHTML = '&' + name + ';';
            var text = entityLookupElement.textContent;
            ENTITIES[name] = text;
            return text;
        }
        return '&' + name + ';';
    }

    /**
     * Replacement callback for String.prototype.replace: the first argument
     * (the whole match) is ignored and the captured entity name is decoded
     * through lookupEntity.
     */
    function decodeOneEntity(_, name) {
        var decoded = lookupEntity(name);
        return decoded;
    }

    // Matches every NUL (U+0000) character.
    var nulRe = /\0/g;
    /**
     * Returns s with every NUL character removed.
     */
    function stripNULs(s) {
        var withoutNuls = s.replace(nulRe, '');
        return withoutNuls;
    }

    // A complete '&...;' reference anywhere in a string (group 1 = name).
    var ENTITY_RE_1 = /&(#[0-9]+|#[xX][0-9A-Fa-f]+|\w+);/g;
    // The same reference body, anchored at the start of text that follows
    // a separately tokenized '&'.
    var ENTITY_RE_2 = /^(#[0-9]+|#[xX][0-9A-Fa-f]+|\w+);/;
    /**
     * Returns the plain text of a chunk of HTML CDATA that may contain
     * character references.  Only references terminated by ';' are decoded;
     * anything else is left as it was written.
     *
     * {\@updoc
     * $ unescapeEntities('')
     * # ''
     * $ unescapeEntities('hello World!')
     * # 'hello World!'
     * $ unescapeEntities('1 &lt; 2 &amp;&AMP; 4 &gt; 3&#10;')
     * # '1 < 2 && 4 > 3\n'
     * $ unescapeEntities('&lt;&lt <- unfinished entity&gt;')
     * # '<&lt <- unfinished entity>'
     * $ unescapeEntities('/foo?bar=baz&copy=true')  // & often unescaped in URLS
     * # '/foo?bar=baz&copy=true'
     * $ unescapeEntities('pi=&pi;&#x3c0;, Pi=&Pi;\u03A0') // FIXME: known failure
     * # 'pi=\u03C0\u03c0, Pi=\u03A0\u03A0'
     * }
     *
     * @param {string} s a chunk of HTML CDATA.  It must not start or end inside
     *     an HTML entity.  A falsy value (empty string, null, undefined) is
     *     returned unchanged.
     */
    function unescapeEntities(s) {
        if (!s) { return s; }
        return s.replace(ENTITY_RE_1, decodeOneEntity);
    }

    // Every '&'.
    var ampRe = /&/g;
    // An '&' that cannot be the start of a character reference: it is followed
    // by a character other than a letter or '#', by '#' and then something
    // other than a digit or 'x', by '#x' and then a non-hex character or the
    // end of input, or directly by the end of input (all case-insensitive).
    // Group 1 captures whatever followed the '&' so a replacement can put it
    // back.
    var looseAmpRe = /&([^a-z#]|#(?:[^0-9x]|x(?:[^0-9a-f]|$)|$)|$)/gi;
    // Every '<'.
    var ltRe = /[<]/g;
    // Every '>'.
    var gtRe = />/g;
    // Every double quote.
    var quotRe = /\"/g;

    /**
     * Escapes HTML special characters in attribute values: '&', '<', '>' and
     * the double quote.  Other characters (including '=') pass through.
     *
     * {\@updoc
     * $ escapeAttrib('')
     * # ''
     * $ escapeAttrib('"<<&==&>>"')  // Do not just escape the first occurrence.
     * # '&#34;&lt;&lt;&amp;==&amp;&gt;&gt;&#34;'
     * $ escapeAttrib('Hello <World>!')
     * # 'Hello &lt;World&gt;!'
     * }
     *
     * @param {*} s the value to escape.  Truthy values are converted to a
     *     string first; falsy values are returned unchanged (not stringified).
     */
    function escapeAttrib(s) {
        if (!s) { return s; }
        var escaped = '' + s;
        // '&' has to go first so the entities added below are not re-escaped.
        escaped = escaped.replace(ampRe, '&amp;');
        escaped = escaped.replace(ltRe, '&lt;');
        escaped = escaped.replace(gtRe, '&gt;');
        return escaped.replace(quotRe, '&#34;');
    }

    /**
     * Escape entities in RCDATA that can be escaped without changing the meaning.
     * An '&' matched by looseAmpRe becomes '&amp;', and every '<' and '>'
     * becomes '&lt;' / '&gt;'.  An '&' that looks like the start of a
     * character reference is left alone.
     * {\@updoc
     * $ normalizeRCData('1 < 2 &&amp; 3 > 4 &amp;& 5 &lt; 7&8')
     * # '1 &lt; 2 &amp;&amp; 3 &gt; 4 &amp;&amp; 5 &lt; 7&amp;8'
     * }
     *
     * @param {string} rcdata the text to normalize; a falsy value is returned
     *     unchanged.
     */
    function normalizeRCData(rcdata) {
        if (!rcdata) { return rcdata; }
        var ampsEscaped = rcdata.replace(looseAmpRe, '&amp;$1');
        var ltEscaped = ampsEscaped.replace(ltRe, '&lt;');
        return ltEscaped.replace(gtRe, '&gt;');
    }

    // TODO(felix8a): validate sanitizer regexs against the HTML5 grammar at
    // http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html
    // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html
    // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html
    // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html

    // We initially split input so that potentially meaningful characters
    // like '<' and '>' are separate tokens, using a fast dumb process that
    // ignores quoting.  Then we walk that token stream, and when we see a
    // '<' that's the start of a tag, we use ATTR_RE to extract tag
    // attributes from the next token.  That token will never have a '>'
    // character.  However, it might have an unbalanced quote character, and
    // when we see that, we combine additional tokens to balance the quote.

    // Matches a single attribute at the start of the remaining tag text
    // (anchored with '^', case-insensitive).  Capture groups:
    //   1    attribute name
    //   2    '=' when a value is present
    //   3    the raw value, still including any quotes
    //   4,5  opening and closing double quote; 5 is the empty string when the
    //        quoted value ran to the end of the text without a closing quote
    //   6,7  the same for single quotes
    // The whole "= value" part is optional, so a bare attribute name matches
    // with groups 2..7 unset.
    var ATTR_RE = new RegExp(
        '^\\s*' +
            '([-.:\\w]+)' +             // 1 = Attribute name
            '(?:' + (
            '\\s*(=)\\s*' +           // 2 = Is there a value?
                '(' + (                   // 3 = Attribute value
                // TODO(felix8a): maybe use backref to match quotes
                '(\")[^\"]*(\"|$)' +    // 4, 5 = Double-quoted string
                    '|' +
                    '(\')[^\']*(\'|$)' +    // 6, 7 = Single-quoted string
                    '|' +
                    // Positive lookahead to prevent interpretation of
                    // <foo a= b=c> as <foo a='b=c'>
                    // TODO(felix8a): might be able to drop this case
                    '(?=[a-z][-\\w]*\\s*=)' +
                    '|' +
                    // Unquoted value that isn't an attribute name
                    // (since we didn't match the positive lookahead above)
                    '[^\"\'\\s]*' ) +
                ')' ) +
            ')?',
        'i');

    // Feature probe: does String.prototype.split include the text matched by
    // capture groups in its result?  false on IE<=8, true on most other
    // browsers.
    var splitWillCapture = (3 === 'a,b'.split(/(,)/).length);

    // bitmask for tags with special parsing, like <script> and <textarea>:
    // an element is a "text" element when it is flagged CDATA or RCDATA.
    var EFLAGS_TEXT = html4.eflags['RCDATA'] | html4.eflags['CDATA'];

    /**
     * Given a SAX-like event handler, produce a function that feeds those
     * events and a parameter to the event handler.
     *
     * The event handler has the form:{@code
     * {
     *   // Name is a lower-case HTML tag name.  Attribs is an array of
     *   // alternating lower-case attribute names, and attribute values.
     *   // Param is the value passed to the saxParser.
     *   startTag: function (name, attribs, param) { ... },
     *   endTag:   function (name, param) { ... },
     *   pcdata:   function (text, param) { ... },
     *   rcdata:   function (text, param) { ... },
     *   cdata:    function (text, param) { ... },
     *   comment:  function (text, param) { ... },
     *   startDoc: function (param) { ... },
     *   endDoc:   function (param) { ... }
     * }}
     * Every member is optional.  All events except startDoc and endDoc are
     * additionally passed two trailing arguments: the continuation marker
     * object and a function that resumes parsing after that event.
     *
     * @param {Object} handler a record containing event handlers.
     * @return {function(string, Object)} A function that takes a chunk of HTML
     *     and a parameter.  The parameter is passed on to the handler methods.
     */
    function makeSaxParser(handler) {
        // Accept quoted or unquoted keys (Closure compat): each callback is
        // looked up under both spellings and stored under the unquoted one.
        var callbacks = {};
        callbacks.cdata = handler.cdata || handler['cdata'];
        callbacks.comment = handler.comment || handler['comment'];
        callbacks.endDoc = handler.endDoc || handler['endDoc'];
        callbacks.endTag = handler.endTag || handler['endTag'];
        callbacks.pcdata = handler.pcdata || handler['pcdata'];
        callbacks.rcdata = handler.rcdata || handler['rcdata'];
        callbacks.startDoc = handler.startDoc || handler['startDoc'];
        callbacks.startTag = handler.startTag || handler['startTag'];
        return function(htmlText, param) {
            return parse(htmlText, callbacks, param);
        };
    }

    // Parsing strategy is to split input into parts that might be lexically
    // meaningful (every ">" becomes a separate part), and then recombine
    // parts if we discover they're in a different context.

    // TODO(felix8a): Significant performance regressions from -legacy,
    // tested on
    //    Chrome 18.0
    //    Firefox 11.0
    //    IE 6, 7, 8, 9
    //    Opera 11.61
    //    Safari 5.1.3
    // Many of these are unusual patterns that are linearly slower and still
    // pretty fast (eg 1ms to 5ms), so not necessarily worth fixing.

    // TODO(felix8a): "<script> && && && ... <\/script>" is slower on all
    // browsers.  The hotspot is htmlSplit.

    // TODO(felix8a): "<p title='>>>>...'><\/p>" is slower on all browsers.
    // This is partly htmlSplit, but the hotspot is parseTagAndAttrs.

    // TODO(felix8a): "<a><\/a><a><\/a>..." is slower on IE9.
    // "<a>1<\/a><a>1<\/a>..." is faster, "<a><\/a>2<a><\/a>2..." is faster.

    // TODO(felix8a): "<p<p<p..." is slower on IE[6-8]

    // Sentinel object handed to every handler callback.  An exception that is
    // identical to this object is caught and discarded by parseCPS, ending the
    // current parsing pass without an error.
    var continuationMarker = {};

    /**
     * Tokenizes htmlText and runs the event loop over it from the first token
     * with fresh scanner state.
     *
     * @param {string} htmlText the markup to parse.
     * @param {Object} handler record of event callbacks.
     * @param {*} param opaque value forwarded to every callback.
     */
    function parse(htmlText, handler, param) {
        parseCPS(
            handler,
            htmlSplit(htmlText),
            0,
            { noMoreGT: false, noMoreEndComments: false },
            param);
    }

    /**
     * Captures a parser position so that parsing can be picked up again
     * later.
     *
     * @return {function()} a function that, each time it is called, runs
     *     parseCPS over the same parts starting at index `initial`.
     */
    function continuationMaker(h, parts, initial, state, param) {
        return function () {
            // Anything passed to the continuation itself is ignored.
            parseCPS(h, parts, initial, state, param);
        };
    }

    /**
     * The parser's event loop.  Walks the token list produced by htmlSplit
     * starting at index `initial` and reports what it finds to the handler.
     *
     * Every event except startDoc/endDoc is passed two extra trailing
     * arguments: continuationMarker, and a function that restarts this loop
     * at the token following the event.  If anything called from here throws
     * continuationMarker itself, the exception is swallowed and this pass
     * simply ends; any other exception propagates.
     *
     * @param {Object} h record of event callbacks; every member is optional.
     * @param {Array.<string>} parts tokens from htmlSplit.
     * @param {number} initial index of the first token to process.  startDoc
     *     is fired only when this is 0.
     * @param {Object} state scanner memo flags (noMoreGT, noMoreEndComments)
     *     shared by the passes over one document.
     * @param {*} param opaque value forwarded to every callback.
     */
    function parseCPS(h, parts, initial, state, param) {
        try {
            if (h.startDoc && initial === 0) { h.startDoc(param); }
            var m, p, tagName;
            for (var pos = initial, end = parts.length; pos < end;) {
                var current = parts[pos++];
                var next = parts[pos];
                switch (current) {
                    case '&':
                        if (ENTITY_RE_2.test(next)) {
                            if (h.pcdata) {
                                // This event covers both the '&' token and the
                                // text token after it, so a continuation must
                                // resume past both; resuming at pos would
                                // report the text a second time.
                                h.pcdata('&' + next, param, continuationMarker,
                                    continuationMaker(h, parts, pos + 1, state, param));
                            }
                            pos++;
                        } else {
                            // Not a well-formed reference: emit an escaped '&'.
                            if (h.pcdata) { h.pcdata("&amp;", param, continuationMarker,
                                continuationMaker(h, parts, pos, state, param));
                            }
                        }
                        break;
                    case '<\/':
                        if ((m = /^([-\w:]+)[^\'\"]*/.exec(next))) {
                            if (m[0].length === next.length && parts[pos + 1] === '>') {
                                // fast case, no attribute parsing needed
                                pos += 2;
                                tagName = m[1].toLowerCase();
                                if (h.endTag) {
                                    h.endTag(tagName, param, continuationMarker,
                                        continuationMaker(h, parts, pos, state, param));
                                }
                            } else {
                                // slow case, need to parse attributes
                                // TODO(felix8a): do we really care about misparsing this?
                                pos = parseEndTag(
                                    parts, pos, h, param, continuationMarker, state);
                            }
                        } else {
                            // '</' not followed by a tag name is plain text.
                            if (h.pcdata) {
                                h.pcdata('&lt;/', param, continuationMarker,
                                    continuationMaker(h, parts, pos, state, param));
                            }
                        }
                        break;
                    case '<':
                        if ((m = /^([-\w:]+)\s*\/?/.exec(next))) {
                            if (m[0].length === next.length && parts[pos + 1] === '>') {
                                // fast case, no attribute parsing needed
                                pos += 2;
                                tagName = m[1].toLowerCase();
                                if (h.startTag) {
                                    h.startTag(tagName, [], param, continuationMarker,
                                        continuationMaker(h, parts, pos, state, param));
                                }
                                // tags like <script> and <textarea> have special parsing
                                var eflags = html4.ELEMENTS[tagName];
                                if (eflags & EFLAGS_TEXT) {
                                    var tag = { name: tagName, next: pos, eflags: eflags };
                                    pos = parseText(
                                        parts, tag, h, param, continuationMarker, state);
                                }
                            } else {
                                // slow case, need to parse attributes
                                pos = parseStartTag(
                                    parts, pos, h, param, continuationMarker, state);
                            }
                        } else {
                            // '<' not followed by a tag name is plain text.
                            if (h.pcdata) {
                                h.pcdata('&lt;', param, continuationMarker,
                                    continuationMaker(h, parts, pos, state, param));
                            }
                        }
                        break;
                    case '<\!--':
                        // The pathological case is n copies of '<\!--' without '-->', and
                        // repeated failure to find '-->' is quadratic.  We avoid that by
                        // remembering when search for '-->' fails.
                        if (!state.noMoreEndComments) {
                            // A comment <\!--x--> is split into three tokens:
                            //   '<\!--', 'x--', '>'
                            // We want to find the next '>' token that has a preceding '--'.
                            // pos is at the 'x--'.
                            for (p = pos + 1; p < end; p++) {
                                if (parts[p] === '>' && /--$/.test(parts[p - 1])) { break; }
                            }
                            if (p < end) {
                                if (h.comment) {
                                    var comment = parts.slice(pos, p).join('');
                                    // Drop the trailing '--' of the terminator.
                                    h.comment(
                                        comment.substr(0, comment.length - 2), param,
                                        continuationMarker,
                                        continuationMaker(h, parts, p + 1, state, param));
                                }
                                pos = p + 1;
                            } else {
                                state.noMoreEndComments = true;
                            }
                        }
                        if (state.noMoreEndComments) {
                            if (h.pcdata) {
                                h.pcdata('&lt;!--', param, continuationMarker,
                                    continuationMaker(h, parts, pos, state, param));
                            }
                        }
                        break;
                    case '<\!':
                        if (!/^\w/.test(next)) {
                            if (h.pcdata) {
                                h.pcdata('&lt;!', param, continuationMarker,
                                    continuationMaker(h, parts, pos, state, param));
                            }
                        } else {
                            // similar to noMoreEndComment logic: skip silently
                            // to just past the next '>', or remember that
                            // there is none.
                            if (!state.noMoreGT) {
                                for (p = pos + 1; p < end; p++) {
                                    if (parts[p] === '>') { break; }
                                }
                                if (p < end) {
                                    pos = p + 1;
                                } else {
                                    state.noMoreGT = true;
                                }
                            }
                            if (state.noMoreGT) {
                                if (h.pcdata) {
                                    h.pcdata('&lt;!', param, continuationMarker,
                                        continuationMaker(h, parts, pos, state, param));
                                }
                            }
                        }
                        break;
                    case '<?':
                        // similar to noMoreEndComment logic
                        if (!state.noMoreGT) {
                            for (p = pos + 1; p < end; p++) {
                                if (parts[p] === '>') { break; }
                            }
                            if (p < end) {
                                pos = p + 1;
                            } else {
                                state.noMoreGT = true;
                            }
                        }
                        if (state.noMoreGT) {
                            if (h.pcdata) {
                                h.pcdata('&lt;?', param, continuationMarker,
                                    continuationMaker(h, parts, pos, state, param));
                            }
                        }
                        break;
                    case '>':
                        // A '>' that did not close anything is plain text.
                        if (h.pcdata) {
                            h.pcdata("&gt;", param, continuationMarker,
                                continuationMaker(h, parts, pos, state, param));
                        }
                        break;
                    case '':
                        break;
                    default:
                        if (h.pcdata) {
                            h.pcdata(current, param, continuationMarker,
                                continuationMaker(h, parts, pos, state, param));
                        }
                        break;
                }
            }
            if (h.endDoc) { h.endDoc(param); }
        } catch (e) {
            if (e !== continuationMarker) { throw e; }
        }
    }

    /**
     * Split str into parts for the html parser.  The result alternates text
     * and separator tokens ('<\/', '<\!--', '<!', '<?', '&', '<', '>'),
     * beginning and ending with a (possibly empty) text part.
     *
     * @param {*} str the input; it is converted to a string first.
     * @return {Array.<string>} the parts.
     */
    function htmlSplit(str) {
        // can't hoist this out of the function because of the re.exec loop.
        var tokenRe = /(<\/|<\!--|<[!?]|[&<>])/g;
        var text = str + '';
        if (splitWillCapture) { return text.split(tokenRe); }

        // split() drops captured separators on this engine, so rebuild the
        // same alternating list by hand.
        var pieces = [];
        var cursor = 0;
        for (var hit = tokenRe.exec(text); hit !== null; hit = tokenRe.exec(text)) {
            pieces.push(text.substring(cursor, hit.index), hit[0]);
            cursor = hit.index + hit[0].length;
        }
        pieces.push(text.substring(cursor));
        return pieces;
    }

    /**
     * Slow-path handling of an end tag whose text needs attribute-style
     * scanning to find its closing '>'.
     *
     * @param {Array.<string>} parts tokens from htmlSplit.
     * @param {number} pos index of the token that starts with the tag name.
     * @param {Object} h record of event callbacks.
     * @param {*} param opaque value forwarded to the callback.
     * @param {Object} continuationMarker sentinel forwarded to the callback.
     * @param {Object} state scanner state forwarded to the continuation.
     * @return {number} index of the token after the tag, or parts.length when
     *     the tag is never closed.
     */
    function parseEndTag(parts, pos, h, param, continuationMarker, state) {
        var tag = parseTagAndAttrs(parts, pos);
        // drop unclosed tags
        if (!tag) { return parts.length; }
        if (h.endTag) {
            // The continuation resumes after the whole tag (tag.next), the
            // same as parseStartTag does; resuming at pos would report the
            // tag's own text as character data.
            h.endTag(tag.name, param, continuationMarker,
                continuationMaker(h, parts, tag.next, state, param));
        }
        return tag.next;
    }

    /**
     * Slow-path handling of a start tag that carries attributes (or
     * otherwise did not match the fast path).  Reports the tag through
     * h.startTag and, for elements flagged as text containers, hands the
     * element content over to parseText.
     *
     * @return {number} index of the next token to process, or parts.length
     *     when the tag is never closed.
     */
    function parseStartTag(parts, pos, h, param, continuationMarker, state) {
        var parsed = parseTagAndAttrs(parts, pos);
        // drop unclosed tags
        if (!parsed) { return parts.length; }
        if (h.startTag) {
            var resume = continuationMaker(h, parts, parsed.next, state, param);
            h.startTag(parsed.name, parsed.attrs, param, continuationMarker, resume);
        }
        // tags like <script> and <textarea> have special parsing
        return (parsed.eflags & EFLAGS_TEXT)
            ? parseText(parts, parsed, h, param, continuationMarker, state)
            : parsed.next;
    }

    // Cache of end-tag matchers, keyed by tag name.
    var endTagRe = {};

    /**
     * Tags like <script> and <textarea> are flagged as CDATA or RCDATA,
     * which means everything is text until we see the correct closing tag.
     * Collects that text (from tag.next up to, but not including, the '<\/'
     * token of the matching end tag, or to the end of input) and reports it
     * through h.cdata, or through h.rcdata after normalizeRCData.
     *
     * @return {number} index of the '<\/' token of the closing tag, or an
     *     index at or past parts.length when there is no closing tag.
     */
    function parseText(parts, tag, h, param, continuationMarker, state) {
        var limit = parts.length;
        var tagName = tag.name;
        if (!endTagRe.hasOwnProperty(tagName)) {
            endTagRe[tagName] = new RegExp('^' + tagName + '(?:[\\s\\/]|$)', 'i');
        }
        var closeRe = endTagRe[tagName];
        var start = tag.next;
        var stop = start + 1;
        // Advance until parts[stop] is the name part of a matching end tag.
        while (stop < limit &&
               !(parts[stop - 1] === '<\/' && closeRe.test(parts[stop]))) {
            stop++;
        }
        // Found one: step back onto its '<\/' token.
        if (stop < limit) { stop -= 1; }
        var text = parts.slice(start, stop).join('');
        var flags = tag.eflags;
        if (flags & html4.eflags['CDATA']) {
            if (h.cdata) {
                h.cdata(text, param, continuationMarker,
                    continuationMaker(h, parts, stop, state, param));
            }
            return stop;
        }
        if (flags & html4.eflags['RCDATA']) {
            if (h.rcdata) {
                h.rcdata(normalizeRCData(text), param, continuationMarker,
                    continuationMaker(h, parts, stop, state, param));
            }
            return stop;
        }
        throw new Error('bug');
    }

    // Parses the tag whose name starts parts[pos], together with its
    // attributes, up to the '>' that closes it.
    // at this point, parts[pos-1] is either "<" or "<\/".
    // The caller must already have checked that parts[pos] begins with a tag
    // name: the match result below is used without a null check.
    // Returns undefined when no closing '>' token exists.  Otherwise returns
    // a record with:
    //   name    lower-cased tag name
    //   eflags  html4.ELEMENTS[name] (undefined for unknown elements)
    //   attrs   array of alternating lower-cased attribute names and decoded
    //           values ('' for an attribute written without a value)
    //   next    index of the token after the closing '>'
    function parseTagAndAttrs(parts, pos) {
        var m = /^([-\w:]+)/.exec(parts[pos]);
        var tag = {};
        tag.name = m[1].toLowerCase();
        tag.eflags = html4.ELEMENTS[tag.name];
        // buf holds the not-yet-parsed attribute text of the tag.
        var buf = parts[pos].substr(m[0].length);
        // Find the next '>'.  We optimistically assume this '>' is not in a
        // quoted context, and further down we fix things up if it turns out to
        // be quoted.
        var p = pos + 1;
        var end = parts.length;
        for (; p < end; p++) {
            if (parts[p] === '>') { break; }
            buf += parts[p];
        }
        if (end <= p) { return void 0; }
        var attrs = [];
        while (buf !== '') {
            m = ATTR_RE.exec(buf);
            if (!m) {
                // No attribute found: skip garbage
                // (one character, plus whatever follows it up to the next
                // lower-case letter or whitespace character)
                buf = buf.replace(/^[\s\S][^a-z\s]*/, '');

            } else if ((m[4] && !m[5]) || (m[6] && !m[7])) {
                // Unterminated quote: slurp to the next unquoted '>'
                // The '>' found above was inside the quoted value, so it and
                // the following tokens are appended to the buffer until a
                // token containing the closing quote has been seen and a
                // later '>' token is reached.
                var quote = m[4] || m[6];
                var sawQuote = false;
                var abuf = [buf, parts[p++]];
                for (; p < end; p++) {
                    if (sawQuote) {
                        if (parts[p] === '>') { break; }
                    } else if (0 <= parts[p].indexOf(quote)) {
                        sawQuote = true;
                    }
                    abuf.push(parts[p]);
                }
                // Slurp failed: lose the garbage
                if (end <= p) { break; }
                // Otherwise retry attribute parsing
                buf = abuf.join('');
                continue;

            } else {
                // We have an attribute
                var aName = m[1].toLowerCase();
                var aValue = m[2] ? decodeValue(m[3]) : '';
                attrs.push(aName, aValue);
                // Consume the matched attribute text.
                buf = buf.substr(m[0].length);
            }
        }
        tag.attrs = attrs;
        // p is the index of the closing '>' (or end, if the slurp failed).
        tag.next = p + 1;
        return tag;
    }

    /**
     * Turns the raw text of an attribute value into its decoded form: if
     * the text starts with a quote character its first and last characters
     * are dropped, then NUL characters are removed and entities are decoded.
     *
     * @param {string} v raw attribute value as matched by ATTR_RE.
     * @return {string} the decoded value.
     */
    function decodeValue(v) {
        var firstCode = v.charCodeAt(0);
        var isQuoted = (firstCode === 0x22 || firstCode === 0x27); // " or '
        var inner = isQuoted ? v.slice(1, -1) : v;
        return unescapeEntities(stripNULs(inner));
    }

    /**
     * Returns a function that strips unsafe tags and attributes from html.
     * @param {function(string, Array.<string>): ?Object} tagPolicy
     *     A function that takes (tagName, attribs[]), where tagName is a key in
     *     html4.ELEMENTS and attribs is an array of alternating attribute names
     *     and values.  It should return a record (as follows), or null to delete
     *     the element.  It's okay for tagPolicy to modify the attribs array,
     *     but the same array is reused, so it should not be held between calls.
     *     Record keys:
     *        attribs: (required) Sanitized attributes array.
     *        tagName: Replacement tag name.
     *     Returning a truthy non-object, or a record without 'attribs', makes
     *     the sanitizer throw an Error.
     * @return {function(string, Array)} A function that sanitizes a string of
     *     HTML and appends result strings to the second argument, an array.
     */
    function makeHtmlSanitizer(tagPolicy) {
        // Elements currently open in the output, innermost last.  Each entry
        // is {orig: tag name in the input, rep: tag name written to output}.
        var stack;
        // True while the content of an element rejected by tagPolicy is being
        // dropped.
        var ignoring;
        // Shared text handler: passes text through unless it is being dropped.
        var emit = function (text, out) {
            if (!ignoring) { out.push(text); }
        };
        return makeSaxParser({
            // Reset the per-document state.
            'startDoc': function(_) {
                stack = [];
                ignoring = false;
            },
            'startTag': function(tagNameOrig, attribs, out) {
                if (ignoring) { return; }
                // Tags that are not in the schema are dropped.
                if (!html4.ELEMENTS.hasOwnProperty(tagNameOrig)) { return; }
                var eflagsOrig = html4.ELEMENTS[tagNameOrig];
                // FOLDABLE elements lose their tag; nothing else changes.
                if (eflagsOrig & html4.eflags['FOLDABLE']) {
                    return;
                }

                var decision = tagPolicy(tagNameOrig, attribs);
                if (!decision) {
                    // Rejected.  Unless the element is EMPTY, start dropping
                    // what follows it.
                    ignoring = !(eflagsOrig & html4.eflags['EMPTY']);
                    return;
                } else if (typeof decision !== 'object') {
                    throw new Error('tagPolicy did not return object (old API?)');
                }
                if ('attribs' in decision) {
                    attribs = decision['attribs'];
                } else {
                    throw new Error('tagPolicy gave no attribs');
                }
                // Name and flags of the element that is actually written out.
                var eflagsRep;
                var tagNameRep;
                if ('tagName' in decision) {
                    tagNameRep = decision['tagName'];
                    eflagsRep = html4.ELEMENTS[tagNameRep];
                } else {
                    tagNameRep = tagNameOrig;
                    eflagsRep = eflagsOrig;
                }
                // TODO(mikesamuel): relying on tagPolicy not to insert unsafe
                // attribute names.

                // If this is an optional-end-tag element and either this element or its
                // previous like sibling was rewritten, then insert a close tag to
                // preserve structure.
                if (eflagsOrig & html4.eflags['OPTIONAL_ENDTAG']) {
                    var onStack = stack[stack.length - 1];
                    if (onStack && onStack.orig === tagNameOrig &&
                        (onStack.rep !== tagNameRep || tagNameOrig !== tagNameRep)) {
                        out.push('<\/', onStack.rep, '>');
                    }
                }

                // EMPTY elements never get an end tag, so they are not tracked.
                if (!(eflagsOrig & html4.eflags['EMPTY'])) {
                    stack.push({orig: tagNameOrig, rep: tagNameRep});
                }

                out.push('<', tagNameRep);
                // attribs holds name/value pairs; a null or undefined value
                // removes that attribute from the output.
                for (var i = 0, n = attribs.length; i < n; i += 2) {
                    var attribName = attribs[i],
                        value = attribs[i + 1];
                    if (value !== null && value !== void 0) {
                        out.push(' ', attribName, '="', escapeAttrib(value), '"');
                    }
                }
                out.push('>');

                if ((eflagsOrig & html4.eflags['EMPTY'])
                    && !(eflagsRep & html4.eflags['EMPTY'])) {
                    // replacement is non-empty, synthesize end tag
                    out.push('<\/', tagNameRep, '>');
                }
            },
            'endTag': function(tagName, out) {
                // Any end tag at all, whatever its name, stops the dropping of
                // content; that end tag itself is not written out.
                if (ignoring) {
                    ignoring = false;
                    return;
                }
                if (!html4.ELEMENTS.hasOwnProperty(tagName)) { return; }
                var eflags = html4.ELEMENTS[tagName];
                if (!(eflags & (html4.eflags['EMPTY'] | html4.eflags['FOLDABLE']))) {
                    // Locate the open element this end tag belongs to,
                    // searching from the innermost outwards.
                    var index;
                    if (eflags & html4.eflags['OPTIONAL_ENDTAG']) {
                        for (index = stack.length; --index >= 0;) {
                            var stackElOrigTag = stack[index].orig;
                            if (stackElOrigTag === tagName) { break; }
                            if (!(html4.ELEMENTS[stackElOrigTag] &
                                html4.eflags['OPTIONAL_ENDTAG'])) {
                                // Don't pop non optional end tags looking for a match.
                                return;
                            }
                        }
                    } else {
                        for (index = stack.length; --index >= 0;) {
                            if (stack[index].orig === tagName) { break; }
                        }
                    }
                    if (index < 0) { return; }  // Not opened.
                    // Close everything nested inside the matched element,
                    // innermost first; elements whose end tag is optional are
                    // popped without writing one.
                    for (var i = stack.length; --i > index;) {
                        var stackElRepTag = stack[i].rep;
                        if (!(html4.ELEMENTS[stackElRepTag] &
                            html4.eflags['OPTIONAL_ENDTAG'])) {
                            out.push('<\/', stackElRepTag, '>');
                        }
                    }
                    // Close with the name that was written when it was opened.
                    if (index < stack.length) {
                        tagName = stack[index].rep;
                    }
                    stack.length = index;
                    out.push('<\/', tagName, '>');
                }
            },
            'pcdata': emit,
            'rcdata': emit,
            'cdata': emit,
            // Close every element that is still open, innermost first.
            'endDoc': function(out) {
                for (; stack.length; stack.length--) {
                    out.push('<\/', stack[stack.length - 1].rep, '>');
                }
            }
        });
    }

    // Schemes a URI may carry and still be offered to the rewriter.
    var ALLOWED_URI_SCHEMES = /^(?:https?|mailto)$/i;

    /**
     * Parses a URI and, if it is scheme-less or uses an allowed scheme, hands
     * it to the caller's rewriter.
     *
     * @param {*} uri The raw URI; it is coerced to a string before parsing.
     * @param {*} effect Passed through to the rewriter unchanged.
     * @param {*} ltype Passed through to the rewriter unchanged.
     * @param {*} hints Passed through to the rewriter unchanged.
     * @param {?function} naiveUriRewriter Receives (parsedUri, effect, ltype,
     *     hints) and returns the replacement, or a falsy value to reject.
     * @return {?string} The rewriter's result converted with toString(), or
     *     null when there is no rewriter, the URI does not parse, the scheme
     *     is not allowed, the rewriter rejects it, or anything throws.
     */
    function safeUri(uri, effect, ltype, hints, naiveUriRewriter) {
        if (!naiveUriRewriter) { return null; }
        try {
            var candidate = URI.parse('' + uri);
            if (!candidate) { return null; }

            var schemeAccepted = !candidate.hasScheme() ||
                ALLOWED_URI_SCHEMES.test(candidate.getScheme());
            if (!schemeAccepted) { return null; }

            var rewritten = naiveUriRewriter(candidate, effect, ltype, hints);
            if (!rewritten) { return null; }
            return rewritten.toString();
        } catch (ignored) {
            // Any failure while parsing or rewriting means "not safe".
            return null;
        }
    }

    /**
     * Reports one sanitizer decision to a caller-supplied logger.
     *
     * Without an attribute name the call describes the removal of a whole
     * tag.  With an attribute name it describes an attribute whose value was
     * added, changed or removed; nothing is reported when the value is
     * unchanged.
     *
     * @param {function(string, Object)} logger Receives a human-readable
     *     message and a record describing the change.
     * @param {string} tagName The tag the report is about.
     * @param {?string|undefined} attribName The attribute the report is about,
     *     or a falsy value for a tag-level report.
     * @param {?string|undefined} oldValue The attribute value before sanitizing.
     * @param {?string|undefined} newValue The attribute value after sanitizing.
     */
    function log(logger, tagName, attribName, oldValue, newValue) {
        if (!attribName) {
            logger(tagName + " removed", {
                change: "removed",
                tagName: tagName
            });
            // A tag-level report is complete on its own.  Continuing would
            // emit a second, attribute-style record with no attribute name
            // whenever oldValue and newValue happen to differ.
            return;
        }
        if (oldValue === newValue) { return; }

        var changed = "changed";
        if (oldValue && !newValue) {
            changed = "removed";
        } else if (!oldValue && newValue) {
            changed = "added";
        }
        logger(tagName + "." + attribName + " " + changed, {
            change: changed,
            tagName: tagName,
            attribName: attribName,
            oldValue: oldValue,
            newValue: newValue
        });
    }

    /**
     * Looks up a per-attribute entry in a schema map keyed by
     * "tagName::attribName", falling back to the wildcard key
     * "*::attribName".
     *
     * Only the map's own properties are consulted.  The own-property check
     * goes through Object.prototype so that it also works for maps created
     * without a prototype or maps that have their own "hasOwnProperty" key.
     *
     * @param {Object} map The schema map to search.
     * @param {string} tagName The element name.
     * @param {string} attribName The attribute name.
     * @return {*} The tag-specific entry if present, else the wildcard entry,
     *     else undefined.
     */
    function lookupAttribute(map, tagName, attribName) {
        var hasOwn = Object.prototype.hasOwnProperty;
        var specificKey = tagName + '::' + attribName;
        if (hasOwn.call(map, specificKey)) {
            return map[specificKey];
        }
        var wildcardKey = '*::' + attribName;
        if (hasOwn.call(map, wildcardKey)) {
            return map[wildcardKey];
        }
        return void 0;
    }
    /**
     * Finds the html4.ATTRIBS entry for an attribute on a tag.
     * @param {string} tagName The element name.
     * @param {string} attribName The attribute name.
     * @return {*} Whatever lookupAttribute finds in html4.ATTRIBS.
     */
    function getAttributeType(tagName, attribName) {
        var attribTypes = html4.ATTRIBS;
        return lookupAttribute(attribTypes, tagName, attribName);
    }
    /**
     * Finds the html4.LOADERTYPES entry for an attribute on a tag.
     * @param {string} tagName The element name.
     * @param {string} attribName The attribute name.
     * @return {*} Whatever lookupAttribute finds in html4.LOADERTYPES.
     */
    function getLoaderType(tagName, attribName) {
        var loaderTypes = html4.LOADERTYPES;
        return lookupAttribute(loaderTypes, tagName, attribName);
    }
    /**
     * Finds the html4.URIEFFECTS entry for an attribute on a tag.
     * @param {string} tagName The element name.
     * @param {string} attribName The attribute name.
     * @return {*} Whatever lookupAttribute finds in html4.URIEFFECTS.
     */
    function getUriEffect(tagName, attribName) {
        var uriEffects = html4.URIEFFECTS;
        return lookupAttribute(uriEffects, tagName, attribName);
    }

    /**
     * Sanitizes the attributes of one HTML tag against the html4 attribute
     * schema.  The attribs array is rewritten in place and also returned.
     *
     * @param {string} tagName An HTML tag name in lowercase.
     * @param {Array.<?string>} attribs An array of alternating names and values.
     * @param {?function(?string): ?string} opt_naiveUriRewriter A transform to
     *     apply to URI attributes; it can return a new string value, or null to
     *     delete the attribute.  If unspecified, URI attributes are deleted.
     * @param {function(?string): ?string} opt_nmTokenPolicy A transform to apply
     *     to attributes containing HTML names, element IDs, and space-separated
     *     lists of classes; it can return a new string value, or null to delete
     *     the attribute.  If unspecified, these attributes are kept unchanged.
     * @param {function(string, Object)=} opt_logger If given, every attribute
     *     decision other than "schema type NONE, kept as is" is passed to
     *     log() together with the value before and after.
     * @return {Array.<?string>} The sanitized attributes as a list of alternating
     *     names and values, where a null value means to omit the attribute.
     */
    function sanitizeAttribs(tagName, attribs,
        opt_naiveUriRewriter, opt_nmTokenPolicy, opt_logger) {
        // TODO(felix8a): it's obnoxious that domado duplicates much of this
        // TODO(felix8a): maybe consistently enforce constraints like target=

        // Forwards one decision to log() when a logger was supplied.
        function report(name, before, after) {
            if (opt_logger) {
                log(opt_logger, tagName, name, before, after);
            }
        }

        // Runs a name-like token through the caller's policy, if there is one.
        function applyTokenPolicy(token) {
            return opt_nmTokenPolicy ? opt_nmTokenPolicy(token) : token;
        }

        // Rebuilds an inline style from the declarations that still have
        // tokens after sanitizeCssProperty has processed them; null if none.
        function sanitizeStyle(cssText) {
            var kept = [];
            parseCssDeclarations(cssText, {
                'declaration': function (property, tokens) {
                    var normProp = property.toLowerCase();
                    var urlRewriter = null;
                    if (opt_naiveUriRewriter) {
                        urlRewriter = function (url) {
                            return safeUri(
                                url, html4.ueffects.SAME_DOCUMENT,
                                html4.ltypes.SANDBOXED,
                                {
                                    "TYPE": "CSS",
                                    "CSS_PROP": normProp
                                }, opt_naiveUriRewriter);
                        };
                    }
                    sanitizeCssProperty(normProp, tokens, urlRewriter);
                    if (tokens.length) {
                        kept.push(normProp + ': ' + tokens.join(' '));
                    }
                }
            });
            return kept.length > 0 ? kept.join(' ; ') : null;
        }

        for (var pos = 0; pos < attribs.length; pos += 2) {
            var attribName = attribs[pos];
            var oldValue = attribs[pos + 1];
            var value = oldValue;

            // Tag-specific schema entry first, then the wildcard entry.
            var attribKey = tagName + '::' + attribName;
            var known = html4.ATTRIBS.hasOwnProperty(attribKey);
            if (!known) {
                attribKey = '*::' + attribName;
                known = html4.ATTRIBS.hasOwnProperty(attribKey);
            }
            var atype = known ? html4.ATTRIBS[attribKey] : null;

            // Only an attribute of type NONE passes through without a report.
            var shouldReport = true;
            if (atype === null) {
                // Not in the schema: drop it.
                value = null;
            } else {
                switch (atype) {
                    case html4.atype['NONE']:
                        shouldReport = false;
                        break;
                    case html4.atype['SCRIPT']:
                        value = null;
                        break;
                    case html4.atype['STYLE']:
                        // Without a CSS parser inline styles cannot be
                        // checked, so they are dropped.
                        value = ('undefined' === typeof parseCssDeclarations)
                            ? null
                            : sanitizeStyle(value);
                        break;
                    case html4.atype['ID']:
                    case html4.atype['IDREF']:
                    case html4.atype['IDREFS']:
                    case html4.atype['GLOBAL_NAME']:
                    case html4.atype['LOCAL_NAME']:
                    case html4.atype['CLASSES']:
                        value = applyTokenPolicy(value);
                        break;
                    case html4.atype['URI']:
                        var uriEffect = getUriEffect(tagName, attribName);
                        var loaderType = getLoaderType(tagName, attribName);
                        value = safeUri(value, uriEffect, loaderType,
                            {
                                "TYPE": "MARKUP",
                                "XML_ATTR": attribName,
                                "XML_TAG": tagName
                            }, opt_naiveUriRewriter);
                        break;
                    case html4.atype['URI_FRAGMENT']:
                        if (value && '#' === value.charAt(0)) {
                            // Apply the token policy to the part after '#',
                            // then put the '#' back unless it was rejected.
                            value = applyTokenPolicy(value.substring(1));
                            if (value !== null && value !== void 0) {
                                value = '#' + value;
                            }
                        } else {
                            value = null;
                        }
                        break;
                    default:
                        value = null;
                        break;
                }
            }

            if (shouldReport) {
                report(attribName, oldValue, value);
            }
            attribs[pos + 1] = value;
        }
        return attribs;
    }

    /**
     * Creates a tag policy that omits all tags marked UNSAFE in html4-defs.js
     * and runs the default attribute sanitizer, with the supplied policies,
     * on every other tag.
     * @param {?function(?string): ?string} opt_naiveUriRewriter A transform to
     *     apply to URI attributes.  If not given, URI attributes are deleted.
     * @param {function(?string): ?string} opt_nmTokenPolicy A transform to apply
     *     to attributes containing HTML names, element IDs, and space-separated
     *     lists of classes.  If not given, such attributes are left unchanged.
     * @param {function(string, Object)=} opt_logger If given, it is handed to
     *     the attribute sanitizer and told about every tag that is omitted.
     * @return {function(string, Array.<?string>)} A tagPolicy suitable for
     *     passing to html.sanitize.  It returns an object whose 'attribs'
     *     property holds the sanitized attributes, or undefined for an
     *     UNSAFE tag.
     */
    function makeTagPolicy(
        opt_naiveUriRewriter, opt_nmTokenPolicy, opt_logger) {
        return function(tagName, attribs) {
            var isUnsafe = html4.ELEMENTS[tagName] & html4.eflags['UNSAFE'];
            if (isUnsafe) {
                // Rejected tag: tell the logger, then return undefined.
                if (opt_logger) {
                    log(opt_logger, tagName, undefined, undefined, undefined);
                }
                return;
            }
            var cleaned = sanitizeAttribs(tagName, attribs,
                opt_naiveUriRewriter, opt_nmTokenPolicy, opt_logger);
            return { 'attribs': cleaned };
        };
    }

    /**
     * Sanitizes HTML tags and attributes according to a given policy.
     * @param {string} inputHtml The HTML to sanitize.
     * @param {function(string, Array.<?string>)} tagPolicy A function that
     *     decides which tags to accept and sanitizes their attributes (see
     *     makeHtmlSanitizer above for details).
     * @return {string} The sanitized HTML: the pieces the sanitizer emitted,
     *     concatenated.
     */
    function sanitizeWithPolicy(inputHtml, tagPolicy) {
        var pieces = [];
        var runSanitizer = makeHtmlSanitizer(tagPolicy);
        runSanitizer(inputHtml, pieces);
        return pieces.join('');
    }

    /**
     * Strips unsafe tags and attributes from HTML.
     * @param {string} inputHtml The HTML to sanitize.
     * @param {?function(?string): ?string} opt_naiveUriRewriter A transform to
     *     apply to URI attributes.  If not given, URI attributes are deleted.
     * @param {function(?string): ?string} opt_nmTokenPolicy A transform to apply
     *     to attributes containing HTML names, element IDs, and space-separated
     *     lists of classes.  If not given, such attributes are left unchanged.
     * @param {function(string, Object)=} opt_logger Passed on to makeTagPolicy.
     * @return {string} The result of sanitizeWithPolicy using a policy built
     *     by makeTagPolicy from the three optional arguments.
     */
    function sanitize(inputHtml,
        opt_naiveUriRewriter, opt_nmTokenPolicy, opt_logger) {
        return sanitizeWithPolicy(
            inputHtml,
            makeTagPolicy(opt_naiveUriRewriter, opt_nmTokenPolicy, opt_logger));
    }

    // Export both quoted and unquoted names for Closure linkage.
    // Each function is stored twice in one statement: once through a dotted
    // property name and once through a quoted property name.  In plain
    // JavaScript both writes hit the same property; the pairing only matters
    // to a compiler that treats the two spellings differently.
    // This local `html` object is what the enclosing function returns.
    var html = {};
    html.escapeAttrib = html['escapeAttrib'] = escapeAttrib;
    html.makeHtmlSanitizer = html['makeHtmlSanitizer'] = makeHtmlSanitizer;
    html.makeSaxParser = html['makeSaxParser'] = makeSaxParser;
    html.makeTagPolicy = html['makeTagPolicy'] = makeTagPolicy;
    html.normalizeRCData = html['normalizeRCData'] = normalizeRCData;
    html.sanitize = html['sanitize'] = sanitize;
    html.sanitizeAttribs = html['sanitizeAttribs'] = sanitizeAttribs;
    html.sanitizeWithPolicy = html['sanitizeWithPolicy'] = sanitizeWithPolicy;
    html.unescapeEntities = html['unescapeEntities'] = unescapeEntities;
    return html;
})(html4);

// Stand-alone alias for the main entry point, html.sanitize.
var html_sanitize = html['sanitize'];

// Exports for Closure compiler: when a `window` object exists, publish both
// names on it under quoted keys.  Note this file is also cajoled for domado
// and run in an environment without 'window', hence the guard.
if ('undefined' !== typeof window) {
    window['html_sanitize'] = html_sanitize;
    window['html'] = html;
}

// Public facade.  To ensure backwards compatibility, the helpers of `html`
// are mirrored here under the same names; `escape` is a second name for
// `escapeAttrib`.  (`sanitize` is attached separately.)
var Sanitizer = {
    escapeAttrib: html.escapeAttrib,
    makeHtmlSanitizer: html.makeHtmlSanitizer,
    makeSaxParser: html.makeSaxParser,
    makeTagPolicy: html.makeTagPolicy,
    normalizeRCData: html.normalizeRCData,
    sanitizeAttribs: html.sanitizeAttribs,
    sanitizeWithPolicy: html.sanitizeWithPolicy,
    unescapeEntities: html.unescapeEntities,
    escape: html.escapeAttrib
};

// https://github.com/theSmaw/Caja-HTML-Sanitizer/issues/8
/**
 * Sanitizes HTML after expanding self-closing tags.
 *
 * Every `<name ... />` in a string input is rewritten to
 * `<name ...></name>` before the text is handed to html.sanitize.
 *
 * @param {*} inputHtml The HTML to sanitize.  Falsy values (empty string,
 *     null, undefined, ...) are returned as they are, without sanitizing.
 * @param {?function(?string): ?string} opt_naiveUriRewriter Passed to
 *     html.sanitize.
 * @param {function(?string): ?string} opt_nmTokenPolicy Passed to
 *     html.sanitize.
 * @param {function(string, Object)=} opt_logger Passed to html.sanitize.
 * @return {*} The result of html.sanitize, or the input itself when falsy.
 */
Sanitizer.sanitize = function(inputHtml, opt_naiveUriRewriter, opt_nmTokenPolicy, opt_logger) {
    if (typeof inputHtml === "string") {
        // The name group must take in digits that follow the first letter.
        // With a letters-only group, `<h1/>` was captured as name "h" plus
        // rest "1" and became `<h1></h>`, leaving the h1 element open.
        inputHtml = inputHtml.replace(
            /<([a-zA-Z][a-zA-Z0-9]*)([^>]*)\/>/g, '<$1$2></$1>');
    }

    if (!inputHtml) {
        return inputHtml;
    }
    return html.sanitize(inputHtml, opt_naiveUriRewriter, opt_nmTokenPolicy, opt_logger);
};

// Publish Sanitizer.  When an `exports` object exists, Sanitizer is attached
// to it as `exports.Sanitizer`; if `module.exports` is available as well,
// Sanitizer first becomes the module's export itself.  Otherwise (e.g. in the
// browser) it is added as a property of the top-level `this`.
if (typeof exports === 'undefined') {
    this.Sanitizer = Sanitizer;
} else {
    if (typeof module !== 'undefined' && module.exports) {
        exports = module.exports = Sanitizer;
    }
    exports.Sanitizer = Sanitizer;
}