source: trunk/modules/GetHtml/TransformInnerHTML.js @ 1344

Last change on this file since 1344 was 1344, checked in by gogo, 7 months ago

#1568 - Fix some 7 bit clean URLs with TransformInnerHTML

  • Property svn:keywords set to LastChangedDate LastChangedRevision LastChangedBy HeadURL Id
File size: 9.9 KB
Line 
1/**
2  * Based on XML_Utility functions submitted by troels_kn.
3  * credit also to adios, who helped with reg exps:
4  * http://www.sitepoint.com/forums/showthread.php?t=201052
5  *
6  * A replacement for Xinha.getHTML
7  *
8  * Features:
9  *   - Generates XHTML code
10  *   - Much faster than Xinha.getHTML
11  *   - Eliminates the hacks to accommodate browser quirks
12  *   - Returns correct code for Flash objects and scripts
13  *   - Formats html in an indented, readable format in html mode
14  *   - Preserves script and pre formatting
15  *   - Preserves formatting in comments
16  *   - Removes contenteditable from body tag in full-page mode
17  *   - Supports only7BitPrintablesInURLs config option
18  *   - Supports htmlRemoveTags config option
19  */
20 
/**
 * Plugin constructor for the TransformInnerHTML GetHtml implementation.
 *
 * @param {Object} editor  the editor instance the plugin is created for;
 *                         kept on the new object as `editor`
 */
function GetHtmlImplementation(editor) {
  this.editor = editor;
}

/**
 * Descriptive metadata for this plugin (name, version, author, license).
 */
GetHtmlImplementation._pluginInfo = {
  name: "GetHtmlImplementation TransformInnerHTML",
  version: "1.0",
  developer: "Nelson Bright",
  developer_url: "http://www.brightworkweb.com/",
  sponsor: "",
  sponsor_url: "",
  license: "htmlArea"
};
34
/**
 * Shared table of the regular expressions used by Xinha.prototype.cleanHTML(),
 * Xinha.indent() and Xinha.getHTML().
 *
 * Entries are looked up by numeric index, so their order must never change;
 * the leading comment on each line is that index.
 */
Xinha.RegExpCache = [
/*00*/  /<\s*\/?([^\s\/>]+)[\s*\/>]/gi,//lowercase tags
/*01*/  /(\s+)_moz[^=>]*=[^\s>]*/gi,//strip _moz attributes
/*02*/  /\s*=\s*(([^'"][^>\s]*)([>\s])|"([^"]+)"|'([^']+)')/g,// find attributes
/*03*/  /\/>/g,//strip singlet terminators
/*04*/  /<(br|hr|img|input|link|meta|param|embed|area)((\s*\S*="[^"]*")*)>/g,//terminate singlet tags
/*05*/  /(<\w+\s+(\w*="[^"]*"\s+)*)(checked|compact|declare|defer|disabled|ismap|multiple|no(href|resize|shade|wrap)|readonly|selected)([\s>])/gi,//expand singlet attributes
/*06*/  /(="[^']*)'([^'"]*")/,//check quote nesting
/*07*/  /&(?=(?!(#[0-9]{2,5};|[a-zA-Z0-9]{2,6};|#x[0-9a-fA-F]{2,4};))[^<]*>)/g,//expand query ampersands not in html entities
/*08*/  /<\s+/g,//strip tagstart whitespace
/*09*/  /\s+(\/)?>/g,//trim whitespace
/*10*/  /\s{2,}/g,//trim extra whitespace
/*11*/  /\s+([^=\s]+)((="[^"]+")|([\s>]))/g,// lowercase attribute names
/*12*/  /\s+contenteditable(=[^>\s\/]*)?/gi,//strip contenteditable
/*13*/  /((href|src)=")([^\s]*)"/g, //find href and src for stripBaseHref()
/*14*/  /<\/?(div|p|h[1-6]|table|tr|td|th|ul|ol|li|dl|dt|dd|blockquote|object|br|hr|img|embed|param|pre|script|html|head|body|meta|link|title|area|input|form|textarea|select|option)[^>]*>/g,
/*15*/  /<\/(div|p|h[1-6]|table|tr|ul|ol|dl|blockquote|html|head|body|script|form|select)( [^>]*)?>/g,//blocklevel closing tag
/*16*/  /<(div|p|h[1-6]|table|tr|ul|ol|dl|blockquote|object|html|head|body|script|form|select)( [^>]*)?>/g,//blocklevel opening tag
/*17*/  /<(td|th|li|dt|dd|option|br|hr|embed|param|pre|meta|link|title|area|input|textarea)[^>]*>/g,//singlet tag or output on 1 line
/*18*/  /(^|<\/(pre|script)>)(\s|[^\s])*?(<(pre|script)[^>]*>|$)/g,//find content NOT inside pre and script tags
/*19*/  /(<pre[^>]*>)([\s\S])*?(<\/pre>)/g,//find content inside pre tags
/*20*/  /(^|<!--[\s\S]*?-->)([\s\S]*?)(?=<!--[\s\S]*?-->|$)/g,//find content NOT inside comments
/*21*/  /\S*=""/g, //find empty attributes
/*22*/  /<!--[\s\S]*?-->|<\?[\s\S]*?\?>|<\/?\w[^>]*>/g, //find all tags, including comments and php
/*23*/  /(^|<\/script>)[\s\S]*?(<script[^>]*>|$)/g //find content NOT inside script tags
];

// Pre-compile the expressions for performance where the engine offers it.
// RegExp.prototype.compile is a deprecated legacy method, so it is only called
// when it actually exists; calling a missing method would throw and abort the
// rest of this file. Engines whose compile() yields undefined (noted for
// WebKit) keep the plain literals above, which behave identically.
var testRE = (typeof RegExp.prototype.compile == 'function')
  ? new RegExp().compile(Xinha.RegExpCache[3])
  : undefined;
if (typeof testRE != 'undefined') {
  for (var i = 0; i < Xinha.RegExpCache.length; i++) {
    Xinha.RegExpCache[i] = new RegExp().compile(Xinha.RegExpCache[i]);
  }
}
68
/**
  * Cleans HTML into wellformed xhtml.
  *
  * Runs the given markup through the Xinha.RegExpCache pipeline: lowercases
  * tag and attribute names, strips _moz and contenteditable attributes, quotes
  * attribute values, expands minimized attributes, self-closes singlet tags
  * and escapes bare ampersands inside tags. Xinha.getHTML() in this file calls
  * it with one tag at a time.
  *
  * Afterwards every href/src URL is optionally rewritten:
  *   - when Xinha.is_ie is set, each URL is passed through this.stripBaseURL();
  *   - when config.only7BitPrintablesInURLs is set, every run of characters
  *     outside the printable range "!" to "~" is escape()d.
  *
  * @param {String} sHtml  markup to clean
  * @returns {String} the cleaned markup
  */
Xinha.prototype.cleanHTML = function(sHtml) {
  var c = Xinha.RegExpCache;
  var editor = this;

  sHtml = sHtml.
    replace(c[0], function(str) { return str.toLowerCase(); } ).//lowercase tags/attribute names
    replace(c[1], ' ').//strip _moz attributes
    replace(c[12], ' ').//strip contenteditable
    replace(c[2], '="$2$4$5"$3').//add attribute quotes
    replace(c[21], ' ').//strip empty attributes
    replace(c[11], function(str, p1, p2) { return ' '+p1.toLowerCase()+p2; }).//lowercase attribute names
    replace(c[3], '>').//strip singlet terminators
    replace(c[9], '$1>').//trim whitespace
    replace(c[5], '$1$3="$3"$5').//expand singlet attributes
    replace(c[4], '<$1$2 />').//terminate singlet tags
    replace(c[6], '$1$2').//check quote nesting
    replace(c[7], '&amp;').//expand query ampersands
    replace(c[8], '<').//strip tagstart whitespace
    replace(c[10], ' ');//trim extra whitespace

  // The URL rewrites use replacer functions so that each href/src match is
  // rebuilt from its OWN captured URL. Reading the legacy static RegExp.$3
  // after a .test() call would hand the first URL to every match, and
  // .test() on a /g expression is stateful (it advances lastIndex), which made
  // a repeated test unreliable. A replacer's return value is inserted
  // literally, so "$" characters in a URL need no escaping here.
  if (Xinha.is_ie) {
    sHtml = sHtml.replace(c[13], function(match, prefix, attrName, url) {
      return prefix + editor.stripBaseURL(url) + '"';
    });
  }

  if (this.config.only7BitPrintablesInURLs) {
    // escape() every run of characters outside the printable range "!".."~"
    var escapeUnprintables = function(text) {
      return text.replace(/([^!-~]+)/g, function(chr) { return escape(chr); });
    };
    sHtml = sHtml.replace(c[13], function(match, prefix, attrName, url) {
      var safeUrl;
      try { //Mozilla returns an incorrectly encoded value with innerHTML
        safeUrl = escapeUnprintables(decodeURIComponent(url));
      } catch (e) { // once the URL is escape()ed, you can't decodeURIComponent() it anymore
        safeUrl = escapeUnprintables(url);
      }
      return prefix + safeUrl + '"';
    });
  }
  return sHtml;
};
106
/**
  * Prettyfies html by inserting linebreaks before tags, and indenting blocklevel tags.
  *
  * Content inside <pre> and <script> elements and inside comments is left
  * untouched; everywhere else whitespace runs are collapsed to one space
  * before line breaks and indentation are inserted. The current depth is
  * tracked in Xinha.__nindent / Xinha.__sindent, which are reset on every call.
  *
  * @param {String} s            the html to format
  * @param {String} sindentChar  (optional) string used for one indentation
  *                              level; defaults to two spaces
  * @returns {String} the formatted html
  */
Xinha.indent = function(s, sindentChar) {
  Xinha.__nindent = 0;
  Xinha.__sindent = "";
  Xinha.__sindentChar = (typeof sindentChar == "undefined") ? "  " : sindentChar;
  var c = Xinha.RegExpCache;
  if(Xinha.is_gecko) { //moz changes returns into <br> inside <pre> tags
    s = s.replace(c[19], function(str){return str.replace(/<br \/>/g,"\n");});
  }
  s = s.replace(c[18], function(strn) { //skip pre and script tags
    return strn.replace(c[20], function(st, lead, content) { //exclude comments
      // `formatted` is a local: assigning an undeclared name here would
      // create a global and throws in strict mode / ES modules.
      var formatted = content.replace(/[\n\r]/gi, " ").replace(/\s+/gi," ").replace(c[14], function(str) {
        if (str.match(c[16])) {
          var line = "\n" + Xinha.__sindent + str;
          // blocklevel openingtag - increase indent
          Xinha.__sindent += Xinha.__sindentChar;
          ++Xinha.__nindent;
          return line;
        } else if (str.match(c[15])) {
          // blocklevel closingtag - decrease indent
          --Xinha.__nindent;
          Xinha.__sindent = "";
          for (var i=Xinha.__nindent;i>0;--i) {
            Xinha.__sindent += Xinha.__sindentChar;
          }
          return "\n" + Xinha.__sindent + str;
        } else if (str.match(c[17])) {
          // singlet tag
          return "\n" + Xinha.__sindent + str;
        }
        return str; // remaining tags (e.g. </li>, </pre>) stay on the current line
      });
      return lead + formatted;
    });
  });
  //final cleanup
  s = s.replace(/^\s*/,'').//strip leading whitespace
    replace(/ +\n/g,'\n').//strip spaces at end of lines
    replace(/[\r\n]+(\s+)<\/script>/g,'\n$1</script>');//strip returns added into scripts
  return s;
};
150
/**
  * Serializes a DOM node to cleaned-up html, built from the node's innerHTML.
  *
  * Every tag found outside <script> content is passed through
  * editor.cleanHTML(); comments and <? ... ?> blocks are copied verbatim.
  *
  * For a document fragment (nodeType 11) the children are cloned into a
  * detached <div> and that div's innerHTML is cleaned; the fragment itself is
  * not modified, and neither htmlRemoveTags filtering nor Xinha.indent() is
  * applied on this path.
  *
  * For any other node, tags matching the optional editor.config.htmlRemoveTags
  * expression are dropped, browser-specific list / trailing <br> fixes are
  * applied, and the result is formatted with Xinha.indent().
  *
  * @param {Node}    root        node whose content is serialized
  * @param {Boolean} outputRoot  when true, the root's own opening tag (with its
  *                              specified attributes) and closing tag are
  *                              emitted around the content
  * @param {Object}  editor      editor providing cleanHTML(), config and _doc
  * @returns {String} the resulting html
  */
Xinha.getHTML = function(root, outputRoot, editor) {
  var html = "";
  var c = Xinha.RegExpCache;

  if(root.nodeType == 11) {//document fragment
    //we can't get innerHTML from the root (type 11) node, so we
    //copy all the child nodes into a new, detached div and get innerHTML from the div
    var div = document.createElement("div");
    for (var child = root.firstChild; child; child = child.nextSibling) {
      div.appendChild(child.cloneNode(true));
    }
    html += div.innerHTML.replace(c[23], function(strn) { //skip content inside script tags
      return strn.replace(c[22], function(tag) {
        if(/^<[!\?]/.test(tag)) return tag; //skip comments and php tags
        return editor.cleanHTML(tag);
      });
    });

  } else {

    var root_tag = (root.nodeType == 1) ? root.tagName.toLowerCase() : '';
    if (outputRoot) { //only happens with <html> tag in fullpage mode
      html += "<" + root_tag;
      var attrs = root.attributes; // strangely, this doesn't work in moz
      for (var i = 0; i < attrs.length; ++i) {
        var a = attrs.item(i);
        if (!a.specified) {
          continue;
        }
        var name = a.nodeName.toLowerCase();
        var value = a.nodeValue;
        // NOTE(review): value is written out unescaped, as before; a value
        // containing a double quote would produce broken markup.
        html += " " + name + '="' + value + '"';
      }
      html += ">";
    }
    var innerhtml;
    if(root_tag == "html") {
      innerhtml = editor._doc.documentElement.innerHTML;
    } else {
      innerhtml = root.innerHTML;
    }
    //pass tags to cleanHTML() one at a time
    //includes support for htmlRemoveTags config option
    html += innerhtml.replace(c[23], function(strn) { //skip content inside script tags
      return strn.replace(c[22], function(tag) {
        if(/^<[!\?]/.test(tag)) return tag; //skip comments and php tags
        if(editor.config.htmlRemoveTags && editor.config.htmlRemoveTags.test(tag.replace(/<([^\s>\/]+)/,'$1'))) {
          return '';
        }
        return editor.cleanHTML(tag);
      });
    });
    //IE drops  all </li>,</dt>,</dd> tags in a list except the last one
    if(Xinha.is_ie) {
      html = html.replace(/<(li|dd|dt)( [^>]*)?>/g,'</$1><$1$2>').
        replace(/(<[uod]l[^>]*>[\s\S]*?)<\/(li|dd|dt)>/g, '$1').
        replace(/\s*<\/(li|dd|dt)>(\s*<\/(li|dd|dt)>)+/g, '</$1>').
        replace(/(<dt[\s>][\s\S]*?)(<\/d[dt]>)+/g, '$1</dt>');
    }
    if(Xinha.is_gecko) {
      html = html.replace(/<br \/>\n$/, ''); //strip trailing <br> added by moz
    }
    //Cleanup redundant whitespace before </li></dd></dt> in IE and Mozilla
    html = html.replace(/\s*(<\/(li|dd|dt)>)/g, '$1');
    if (outputRoot) {
      html += "</" + root_tag + ">";
    }
    html = Xinha.indent(html);
  }

  return html;
};
222
/**
  * Doubles every dollar sign ($ becomes $$) so the result can be used safely
  * as (part of) the replacement string of a replace() call.
  *
  * Use it whenever the replacement string is built from a variable that may
  * contain dollar signs which must not be read as references to captured
  * groups (e.g. the literal text "$10" rather than group 1 followed by "0").
  * See http://trac.xinha.org/ticket/1337
  *
  * @param {String} str  text destined for a replacement string
  * @returns {String} str with each $ doubled
  */
Xinha._escapeDollars = function(str) {
  var everyDollar = /\$/g;
  // inside a replacement pattern "$$" emits one "$", so four emit two
  var twoDollars = "$$$$";
  return str.replace(everyDollar, twoDollars);
};
Note: See TracBrowser for help on using the repository browser.