Skip to content

Commit

Permalink
add some html test case
Browse files Browse the repository at this point in the history
  • Loading branch information
jindw committed Jun 10, 2012
1 parent 03231bf commit b6fd947
Show file tree
Hide file tree
Showing 8 changed files with 144 additions and 72 deletions.
12 changes: 9 additions & 3 deletions dom-parser.js
Original file line number Diff line number Diff line change
@@ -1,14 +1,20 @@
function DOMParser(){

}
DOMParser.prototype.parseFromString = function(source){
DOMParser.prototype.parseFromString = function(source,mimeType){
var sax = new XMLReader();
var handler = new DOMHandler();
var defaultNSMap = {};
var entityMap = {'lt':'<','gt':'>','amp':'&','quot':'"','apos':"'"}
sax.contentHandler = handler;
sax.lexicalHandler = handler;
sax.errorHandler = handler;

sax.parse(source);
if(/\/x?html?$/.test(mimeType)){
entityMap.nbsp = '\xa0';
entityMap.copy = '\xa9';
defaultNSMap['']= 'http://www.w3.org/1999/xhtml';
}
sax.parse(source,defaultNSMap,entityMap);
return handler.document;
}
/**
Expand Down
9 changes: 5 additions & 4 deletions dom.js
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ function _extends(Class,Super){
pt.constructor = Class
}
}
var htmlns = 'http://www.w3.org/1999/xhtml' ;
// Node Types
var NodeType = {}
var ELEMENT_NODE = NodeType.ELEMENT_NODE = 1;
Expand All @@ -49,7 +50,6 @@ var DOCUMENT_TYPE_NODE = NodeType.DOCUMENT_TYPE_NODE = 10;
var DOCUMENT_FRAGMENT_NODE = NodeType.DOCUMENT_FRAGMENT_NODE = 11;
var NOTATION_NODE = NodeType.NOTATION_NODE = 12;


// ExceptionCode
var ExceptionCode = {}
var ExceptionMessage = {};
Expand Down Expand Up @@ -904,14 +904,15 @@ function serializeToString(node,buf){
var len = attrs.length;
var child = node.firstChild;
var nodeName = node.tagName;
var isHTML = htmlns === node.namespaceURI
buf.push('<',nodeName);
for(var i=0;i<len;i++){
serializeToString(attrs.item(i),buf);
serializeToString(attrs.item(i),buf,isHTML);
}
if(child){
if(child || isHTML && !/^(?:meta|link|img|br|hr|input)$/i.test(nodeName)){
buf.push('>');
//if is cdata child node
if(/^script$/i.test(nodeName)){
if(isHTML && /^script$/i.test(nodeName)){
buf.push(child.data);
}else{
while(child){
Expand Down
100 changes: 46 additions & 54 deletions sax.js
Original file line number Diff line number Diff line change
@@ -1,31 +1,32 @@
//var handlers = 'resolveEntity,getExternalSubset,characters,endDocument,endElement,endPrefixMapping,ignorableWhitespace,processingInstruction,setDocumentLocator,skippedEntity,startDocument,startElement,startPrefixMapping,notationDecl,unparsedEntityDecl,error,fatalError,warning,attributeDecl,elementDecl,externalEntityDecl,internalEntityDecl,comment,endCDATA,endDTD,endEntity,startCDATA,startDTD,startEntity'.split(',')
function XMLReader(){
}
//[4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
//[4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
//[5] Name ::= NameStartChar (NameChar)*
var nameStartChar = /[A-Z_a-z\xC0-\xD6\xD8-\xF6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\u10000-\uEFFFF]/
var nameChar = new RegExp("[\-\.0-9"+nameStartChar.source.slice(1,-1)+"\u00B7\u0300-\u036F\ux203F-\u2040]");
var standardName = new RegExp('^'+nameStartChar.source+nameChar.source+'*(?:\:'+nameStartChar.source+nameChar.source+'*)?$');
var tagNamePattern = standardName || /^[a-zA-Z_][\w\-\.]*(?:\:[a-zA-Z_][\w\-\.]*)?$/
// XML 1.0 name productions, built as regular expressions.
// nameStartChar is production [4] NameStartChar WITHOUT ':' -- the colon is handled
// separately by tagNamePattern as the prefix/local-name separator. The supplementary
// range [#x10000-#xEFFFF] is left out because a \uXXXX escape cannot express it.
var nameStartChar = /[A-Z_a-z\xC0-\xD6\xD8-\xF6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD]/
// Production [4a] NameChar: NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040].
// The escapes are kept as regex escapes (double backslash) so the pattern source stays ASCII.
// The final range must be exactly \u203F-\u2040: a malformed escape here silently widens the
// class to almost every character from 'F' upward and lets invalid names through.
var nameChar = new RegExp("[\\-\\.0-9"+nameStartChar.source.slice(1,-1)+"\\u00B7\\u0300-\\u036F\\u203F-\\u2040]");
// A qualified name: Name, optionally followed by ':' and a second Name (prefix:localName).
var tagNamePattern = new RegExp('^'+nameStartChar.source+nameChar.source+'*(?::'+nameStartChar.source+nameChar.source+'*)?$');
//var tagNamePattern = /^[a-zA-Z_][\w\-\.]*(?:\:[a-zA-Z_][\w\-\.]*)?$/

//var handlers = 'resolveEntity,getExternalSubset,characters,endDocument,endElement,endPrefixMapping,ignorableWhitespace,processingInstruction,setDocumentLocator,skippedEntity,startDocument,startElement,startPrefixMapping,notationDecl,unparsedEntityDecl,error,fatalError,warning,attributeDecl,elementDecl,externalEntityDecl,internalEntityDecl,comment,endCDATA,endDTD,endEntity,startCDATA,startDTD,startEntity'.split(',')
function XMLReader(){
}

XMLReader.prototype = {
parse:function(source){
parse:function(source,defaultNSMap,entityMap){
var contentHandler = this.contentHandler;
contentHandler.startDocument();
parse(source,this.entityMap,contentHandler,this.lexicalHandler,this.errorHandler);
_copy(defaultNSMap ,defaultNSMap = {})
parse(source,defaultNSMap,entityMap,
contentHandler,this.lexicalHandler,this.errorHandler);
contentHandler.endDocument();
},
entityMap:{'lt':'<','gt':'>','amp':'&','quot':'"','apos':"'",
'nbsp':'&#160;','copy':'&#169;'}
}
}
function parse(source,entityMap,contentHandler,lexHandler,errorHandler){
function parse(source,defaultNSMapCopy,entityMap,contentHandler,lexHandler,errorHandler){
function entityReplacer(a){
var k = a.slice(1,-1);
if(k.charAt(0) == '#'){
return String.fromCharCode(parseInt(k.substr(1).replace('x','0x')))
}else if(k in entityMap){
if(k in entityMap){
return entityMap[k];
}else if(k.charAt(0) === '#'){
return String.fromCharCode(parseInt(k.substr(1).replace('x','0x')))
}else{
errorHandler.error('entity not found:'+a);
return a;
Expand All @@ -36,8 +37,7 @@ function parse(source,entityMap,contentHandler,lexHandler,errorHandler){
contentHandler.characters(xt,0,end-start);
start = end
}

var elStack = [{currentNSMap:{}}]
var elStack = [{currentNSMap:defaultNSMapCopy}]
var closeMap = {};
var start = 0;
while(true){
Expand Down Expand Up @@ -88,12 +88,13 @@ function parse(source,entityMap,contentHandler,lexHandler,errorHandler){
}
}
}
function parseSpecialContent(tagName,source,p,entityReplacer,contentHandler,lexHandler){
if(/^(?:script|textarea)$/i.test(tagName)){
function parseSpecialContent(el,source,p,entityReplacer,contentHandler,lexHandler){
var ns = el.uri;
var tagName = el.tagName;
if(ns === 'http://www.w3.org/1999/xhtml' &&/^(?:script|textarea)$/i.test(tagName)){
var end = source.indexOf('</'+tagName+'>',p);
var text = source.substring(p+1,end);
if(/[&<]/.test(text)){
//console.log('######',tagName,end,text)
if(/^script$/i.test(tagName)){
//if(!/\]\]>/.test(text)){
//lexHandler.startCDATA();
Expand Down Expand Up @@ -141,9 +142,6 @@ function parseElementAttribute(source,start,entityReplacer,contentHandler,lexHan
case '=':
if(s === 2){//attrName
attrName = source.slice(start,p);
if(!tagNamePattern.test(attrName)){
return -1;
}
s = 4;
}else if(s === 3){
s = 4;
Expand All @@ -159,27 +157,34 @@ function parseElementAttribute(source,start,entityReplacer,contentHandler,lexHan
switch(s){
case 0:
case 2:
attrName = source.slice(start,p)
case 3:
//reportWarning for s == 3
if(attrName.slice(-1) === '/'){
case 5:
var value = source.slice(start,p);
if(value.slice(-1) === '/'){
selfClosed = true;
attrName = attrName.slice(0,-1)
value = value.slice(0,-1)
}
if(attrName){
if(s){//only s == 2
el[index++] = {qName:attrName,value:attrName}
}else{
tagName = attrName;
if(value){
if(s == 2){
el[index++] = {qName:value,value:value}
}else if(s==5){
el[index++] = {qName:attrName,value:value}
}else if(s ==0){
tagName = value;
}
}
break;
case 1:
default:
//case 4://error
//case 4://error

}
// console.log(tagName,tagNamePattern,tagNamePattern.test(tagName))
el.length = index;
selfClosed = selfClosed||fixSelfClosed(closeMap,source,tagName,p)
appendElement(contentHandler,elStack,el,tagName,selfClosed);
return selfClosed ?p+1: parseSpecialContent(tagName,source,p,entityReplacer,contentHandler,lexHandler) || p+1;
return selfClosed ?p+1: parseSpecialContent(el,source,p,entityReplacer,contentHandler,lexHandler) || p+1;
/*xml space '\x20' | #x9 | #xD | #xA; */
case '\u0080':
c = ' ';
Expand Down Expand Up @@ -224,9 +229,10 @@ function fixSelfClosed(closeMap,source,tagName,p){
//if(tagName in closeMap){
var pos = closeMap[tagName];
if(pos == null){
pos = closeMap[tagName] = source.lastIndexOf('</'+tagName+'>',p)
//console.log(tagName)
pos = closeMap[tagName] = source.lastIndexOf('</'+tagName+'>')
}
return pos>p;
return pos<p;
//}
}
function appendElement(contentHandler,elStack,el,tagName,selfClosed){
Expand Down Expand Up @@ -262,12 +268,8 @@ function appendElement(contentHandler,elStack,el,tagName,selfClosed){
//prefix == null for no ns prefix attribute
if(nsPrefix !== false){//hack!!
if(localNSMap == null){
localNSMap = {};
for(var n in currentNSMap){
localNSMap[n] = currentNSMap[n];
}
currentNSMap = localNSMap;
localNSMap = {}
_copy(currentNSMap,currentNSMap={})
}
currentNSMap[nsPrefix] = localNSMap[nsPrefix] = value;
a.uri = 'http://www.w3.org/2000/xmlns/'
Expand Down Expand Up @@ -295,7 +297,6 @@ function appendElement(contentHandler,elStack,el,tagName,selfClosed){
}
//no prefix element has default namespace
contentHandler.startElement(el.uri = currentNSMap[prefix || ''],localName,tagName,el);

if(selfClosed){
contentHandler.endElement(el.uri,localName,tagName);
}else{
Expand All @@ -310,7 +311,9 @@ function appendElement(contentHandler,elStack,el,tagName,selfClosed){
}
}
}

/**
 * Shallow-copies every enumerable property of `source` (inherited ones included,
 * since a for-in loop is used) onto `target`, overwriting existing keys.
 * Nothing is returned; `target` is modified in place.
 */
function _copy(source,target){
	for(var key in source){
		var value = source[key];
		target[key] = value;
	}
}
function parseDCC(source,start,contentHandler,lexHandler){//sure start with '<!'
var next= source.charAt(start+2)
switch(next){
Expand Down Expand Up @@ -423,17 +426,6 @@ function split(source,start){
}
}

/**
 * Tokenizes the start of `source` with a global regex and collects the match
 * arrays that follow the first match (the first match is discarded, normally
 * the opening '<').
 *
 * Tokens are: single- or double-quoted strings, runs of characters other than
 * whitespace/'<'/'>'/'/'/'=' with an optional trailing '=', and the
 * terminators '<', '>' or '/>' (captured in group 1).
 *
 * @param {string} source text to tokenize
 * @returns {Array|undefined} the collected match arrays up to and including
 *   the first terminator; undefined when no terminator is found.
 */
function split2(source){
	var tokenPattern = /'[^']+'|"[^"]+"|[^\s<>\/=]+=?|(\/?\s*>|<)/g;
	var tokens = [];
	// Discard the first token (the leading '<').
	tokenPattern.exec(source);
	for(var token = tokenPattern.exec(source); token; token = tokenPattern.exec(source)){
		tokens.push(token);
		if(token[1]){
			// Group 1 only participates for a terminator, which ends the scan.
			return tokens;
		}
	}
}
if(typeof require == 'function'){
exports.XMLReader = XMLReader;
}
Expand Down
7 changes: 5 additions & 2 deletions test/dom/clone.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,16 @@ wows.describe('XML Namespace Parse').addBatch({
'clone': function () {
var doc1 = new DOMParser().parseFromString("<doc1 attr1='1' attr2='a2'>text1<child>text2</child></doc1>",'text/xml')
var n =doc1.cloneNode(true)
console.log(new XMLSerializer().serializeToString(doc1))
console.assert(n == new XMLSerializer().serializeToString(doc1))
},
'import': function () {
var doc1 = new DOMParser().parseFromString("<doc2 attr='2'/>")
var doc2 = new DOMParser().parseFromString("<doc1 attr1='1' attr2='a2'>text1<child>text2</child></doc1>",'text/xml')

var doc3 = new DOMParser().parseFromString("<doc2 attr='2'><doc1 attr1='1' attr2='a2'>text1<child>text2</child></doc1></doc2>")
var n =doc1.importNode(doc2.documentElement, true)
doc1.documentElement.appendChild(n)
console.log(new XMLSerializer().serializeToString(doc1))
console.assert(doc1 == doc3+'')
console.assert(doc2 != doc3+'')
}
}).run(); // Run it
6 changes: 3 additions & 3 deletions test/dom/element.js
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,9 @@ wows.describe('XML Namespace Parse').addBatch({



var feed = new DOMParser().parseFromString('<feed><entry>foo</entry></feed>');
var entries = feed.documentElement.getElementsByTagName('entry');
console.log(entries[0].nodeName);
var feed = new DOMParser().parseFromString('<feed><entry>foo</entry></feed>');
var entries = feed.documentElement.getElementsByTagName('entry');
console.log(entries[0].nodeName);
console.log(feed.documentElement.childNodes.item(0).nodeName);
},
'getElementsByTagNameNS': function () {
Expand Down
65 changes: 65 additions & 0 deletions test/html/normalize.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
// Vows suite for HTML normalization: each case parses a fragment with the
// 'text/html' mime type and compares the resulting document with the expected markup.
// The comparisons use `dom == '<string>'`, so the document is coerced to a string
// by the loose equality; `dom+''` is passed as the message shown when the check fails.
// NOTE(review): `assert`, `XMLSerializer` and `parser` are declared below but never
// used in this file.
var wows = require('vows');
var assert = require('assert');
var DOMParser = require('xmldom').DOMParser;
var XMLSerializer = require('xmldom').XMLSerializer;
var parser = new DOMParser();
// Build the 'html normalizer' suite and run it immediately.
wows.describe('html normalizer').addBatch({
// Text content: bare '&' and '<' that do not form valid markup are expected to come
// back escaped as &amp; / &lt;.
'text & <': function () {
var dom = new DOMParser().parseFromString('<div>&amp;&lt;123&456<789;&&</div>','text/html');
console.assert(dom == '<div>&amp;&lt;123&amp;456&lt;789;&amp;&amp;</div>',dom+'')

var dom = new DOMParser().parseFromString('<div><123e>&<a<br/></div>','text/html');
console.assert(dom == '<div>&lt;123e>&amp;&lt;a<br/></div>',dom+'')

// &nbsp; and &copy; with a terminating ';' are expected to decode to U+00A0 / U+00A9;
// without the ';' they are expected to stay as text with the '&' escaped.
var dom = new DOMParser().parseFromString('<div>&nbsp;&copy;&nbsp&copy</div>','text/html');
console.assert(dom == '<div>\u00a0\u00a9&amp;nbsp&amp;copy</div>',dom+'')


},
// Attributes: raw '<' and '&' inside values, unquoted values, attributes without a
// value (expected to serialize as name="name") and a repeated attribute name.
'attr': function () {
var dom = new DOMParser().parseFromString('<div test="alert(\'<br/>\')"/>','text/html');
console.assert(dom == '<div test="alert(\'&lt;br/>\')"></div>',dom+'')
var dom = new DOMParser().parseFromString('<div test="a<b&&a< c && a>d"></div>','text/html');
console.assert(dom == '<div test="a&lt;b&amp;&amp;a&lt; c &amp;&amp; a>d"></div>',dom+'')

var dom = new DOMParser().parseFromString('<div a=& bb c d=123&&456/>','text/html');
console.assert(dom == '<div a="&amp;" bb="bb" c="c" d="123&amp;&amp;456"></div>',dom+'')

// `a` appears twice; the expected output keeps the later value.
var dom = new DOMParser().parseFromString('<div a=& a="&\'\'" b/>','text/html');
console.assert(dom == '<div a="&amp;\'\'" b="b"></div>',dom+'')
},
// Tags written without a closing tag (meta, link, img, br, hr, input) are expected to
// serialize self-closed, and unquoted attribute values next to '/' or '/>' are covered.
"unclosed":function(){
var dom = new DOMParser().parseFromString('<html><meta><link><img><br><hr><input></html>','text/html');
console.assert(dom == '<html><meta/><link/><img/><br/><hr/><input/></html>',dom+'')

var dom = new DOMParser().parseFromString('<html title =1/2></html>','text/html');
console.assert(dom == '<html title="1/2"></html>',dom+'')

var dom = new DOMParser().parseFromString('<html title= 1/>','text/html');
console.assert(dom == '<html title="1"></html>',dom+'')

var dom = new DOMParser().parseFromString('<html title = 1/>','text/html');
console.assert(dom == '<html title="1"></html>',dom+'')

var dom = new DOMParser().parseFromString('<html title/>','text/html');
console.assert(dom == '<html title="title"></html>',dom+'')



// NOTE(review): this case repeats the first case of this test verbatim.
var dom = new DOMParser().parseFromString('<html><meta><link><img><br><hr><input></html>','text/html');
console.assert(dom == '<html><meta/><link/><img/><br/><hr/><input/></html>',dom+'')


},
// <script> content is expected to round-trip unchanged, with '<' and '&' left unescaped.
'script': function () {
var dom = new DOMParser().parseFromString('<script>alert(a<b&&c?"<br>":">>");</script>','text/html');
console.assert(dom == '<script>alert(a<b&&c?"<br>":">>");</script>',dom+'')
var dom = new DOMParser().parseFromString('<script>alert(a<b&&c?"<br/>":">>");</script>','text/html');
console.assert(dom == '<script>alert(a<b&&c?"<br/>":">>");</script>',dom+'')
},
// <textarea> content is treated as text: '<' and '&' are expected to be escaped on output.
'textarea': function () {
var dom = new DOMParser().parseFromString('<textarea>alert(a<b&&c?"<br>":">>");</textarea>','text/html');
console.assert(dom == '<textarea>alert(a&lt;b&amp;&amp;c?"&lt;br>":">>");</textarea>',dom+'')
}
}).run();
6 changes: 2 additions & 4 deletions test/parse-html.js → test/html/parse.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,11 @@ var parser = new DOMParser();
wows.describe('html Node Parse').addBatch({
'node': function () {
var dom = new DOMParser().parseFromString('<html xmlns:x="1"><body/></html>','text/html');
console.log(dom+'',dom.documentElement.firstChild+'')
console.assert(dom == '<html xmlns:x="1"><body/></html>',dom+'')
console.assert(dom == '<html xmlns:x="1"><body></body></html>',dom+'')
},
'attr': function () {
var dom = new DOMParser().parseFromString('<html test="a<b && a>b && \'&amp;&&\'"/>','text/html');
console.log(dom+'')
console.assert(dom == '<html test="a&lt;b &amp;&amp; a>b &amp;&amp; \'&amp;&amp;&amp;\'"/>',dom+'')
console.assert(dom == '<html test="a&lt;b &amp;&amp; a>b &amp;&amp; \'&amp;&amp;&amp;\'"></html>',dom+'')
},
'script': function () {
var dom = new DOMParser().parseFromString('<script>alert(a<b&&c?"<br>":">>");</script>','text/html');
Expand Down
11 changes: 9 additions & 2 deletions test/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,7 @@ function format(s){
}
return result;
}
DOMParser.prototype.parseFromString = function(data){
var doc = oldParser.apply(this,arguments);
function check(data,doc){
var domjsresult = format(data);
var xmldomresult = new XMLSerializer().serializeToString(doc);
var xmldomresult2 = new XMLSerializer().serializeToString(doc.cloneNode(true));
Expand All @@ -30,11 +29,19 @@ DOMParser.prototype.parseFromString = function(data){
domjsresult = domjsresult.replace(/^<\?.*?\?>\s*|<!\[CDATA\[\]\]>/g,'')
//console.log('['+xmldomresult+'],['+domjsresult+']')
assert.equal(xmldomresult,domjsresult);
}
// Test-harness wrapper around the original parser: delegates to `oldParser` with the
// caller's arguments, then runs `check` on the result unless the mime type matches the
// HTML pattern (a '/' followed by html, xhtml, htm or xhtm at a word boundary).
// The parsed document is always returned.
DOMParser.prototype.parseFromString = function(data,mimeType){
	var parsed = oldParser.apply(this,arguments);
	var isHtmlMime = /\/x?html?\b/.test(mimeType);
	if(isHtmlMime){
		return parsed;
	}
	check(data,parsed);
	return parsed;
}

require('./dom');
require('./parse-element');
require('./node');
require('./namespace');
require('./html/parse');
require('./html/normalize');
//require('./big-file-performance');

0 comments on commit b6fd947

Please sign in to comment.