parser.js 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247
  /**
   * Simple HTML Parser
   *
   * @author Zongmin Lei<leizongmin@gmail.com>
   */
  6. var _ = require("./util");
  /**
   * Extract the lower-cased tag name from the source text of a single tag.
   *
   * The name is taken from just after the leading "<" up to the position
   * reported by `_.spaceIndex` (or up to the final ">" when that reports -1).
   * It is then trimmed and lower-cased, and one leading "/" (closing tag) and
   * one trailing "/" (self-closing tag) are removed.
   *
   * NOTE(review): `_.spaceIndex` lives in ./util and is not visible here;
   * presumably it returns the index of the first whitespace character, or -1
   * when there is none - confirm against ./util.
   *
   * @param {String} html tag source, e.g. '<a href="#">'
   * @return {String} tag name, e.g. 'a'
   */
  function getTagName(html) {
    var i = _.spaceIndex(html);
    var tagName;
    if (i === -1) {
      // No separator found: drop the leading "<" and the final character.
      tagName = html.slice(1, -1);
    } else {
      // Keep everything up to and including index i; the trim below removes
      // any surrounding whitespace.
      tagName = html.slice(1, i + 1);
    }
    tagName = _.trim(tagName).toLowerCase();
    if (tagName.slice(0, 1) === "/") tagName = tagName.slice(1);
    if (tagName.slice(-1) === "/") tagName = tagName.slice(0, -1);
    return tagName;
  }
  /**
   * Tell whether the source text of a tag is a closing tag.
   *
   * @param {String} html tag source, e.g. '</a>'
   * @return {Boolean} true when the text starts with "</"
   */
  function isClosing(html) {
    return html.slice(0, 2) === "</";
  }
  /**
   * Scan `html`, pass every complete tag to `onTag`, and escape all other text.
   *
   * A tag starts at "<" and ends at the next ">" that is not inside a quoted
   * attribute value. A quote only opens a quoted value when, skipping
   * whitespace backwards, the previous character is "=". A second "<" seen
   * before the tag is closed restarts the tag there; the text before it is
   * escaped. Anything left over at the end of the input (including a tag that
   * is never closed) is escaped as plain text.
   *
   * @param {String} html input to scan
   * @param {Function} onTag called as
   *   `onTag(sourcePosition, position, tag, html, isClosing)` where
   *   `sourcePosition` is the index of the tag's "<" in the input, `position`
   *   is the length of the output built so far, `tag` is the tag name, `html`
   *   is the tag's source text and `isClosing` tells whether it starts with
   *   "</". Its return value is appended to the output.
   * @param {Function} escapeHtml called with each run of non-tag text; its
   *   return value is appended to the output
   * @return {String} the processed html
   */
  function parseTag(html, onTag, escapeHtml) {
    "use strict";

    var rethtml = "";
    var lastPos = 0; // start of the input that has not been emitted yet
    var tagStart = false; // index of the pending "<", or false outside a tag
    var quoteStart = false; // quote character of the open value, or false
    var currentPos = 0;
    var len = html.length;
    var currentTagName = "";
    var currentHtml = "";

    chariterator: for (currentPos = 0; currentPos < len; currentPos++) {
      var c = html.charAt(currentPos);
      if (tagStart === false) {
        if (c === "<") {
          tagStart = currentPos;
          continue;
        }
      } else {
        if (quoteStart === false) {
          if (c === "<") {
            // The pending "<" was not a tag after all: emit it as text and
            // restart the tag at this position.
            rethtml += escapeHtml(html.slice(lastPos, currentPos));
            tagStart = currentPos;
            lastPos = currentPos;
            continue;
          }
          if (c === ">") {
            rethtml += escapeHtml(html.slice(lastPos, tagStart));
            currentHtml = html.slice(tagStart, currentPos + 1);
            currentTagName = getTagName(currentHtml);
            rethtml += onTag(
              tagStart,
              rethtml.length,
              currentTagName,
              currentHtml,
              isClosing(currentHtml)
            );
            lastPos = currentPos + 1;
            tagStart = false;
            continue;
          }
          if (c === '"' || c === "'") {
            // Walk backwards over whitespace; the quote opens a value only if
            // an "=" is found. The walk always stops at the pending "<" at the
            // latest, because "<" is neither whitespace nor "=".
            var i = 1;
            var ic = html.charAt(currentPos - i);
            while (ic.trim() === "" || ic === "=") {
              if (ic === "=") {
                quoteStart = c;
                continue chariterator;
              }
              ic = html.charAt(currentPos - ++i);
            }
          }
        } else {
          if (c === quoteStart) {
            quoteStart = false;
            continue;
          }
        }
      }
    }
    if (lastPos < len) {
      rethtml += escapeHtml(html.slice(lastPos));
    }
    return rethtml;
  }
  // Matches every character that is not allowed in an attribute name;
  // parseAttr removes these characters from names before reporting them.
  var REGEXP_ILLEGAL_ATTR_NAME = /[^a-zA-Z0-9_:\.\-]/gim;

  /**
   * Parse the attribute part of a tag and rebuild it from `onAttr` results.
   *
   * The input is scanned character by character:
   *  - the text before an "=" becomes the pending attribute name;
   *  - a quote directly after that "=" starts a quoted value that runs to the
   *    next identical quote (scanning stops if there is none);
   *  - otherwise a value ends at the next whitespace character, and one pair
   *    of wrapping quotes is removed from it;
   *  - a name that is followed by whitespace and no "=" is reported with an
   *    empty value.
   * Names are trimmed, stripped of illegal characters and lower-cased; empty
   * names are skipped. Every truthy value returned by `onAttr` is kept and
   * the kept values are joined with a single space.
   *
   * @param {String} html e.g. `href="#" target="_blank"`
   * @param {Function} onAttr called as `onAttr(name, value)`; returns the text
   *   to emit for the attribute, or a falsy value to drop it
   * @return {String} the processed attributes
   */
  function parseAttr(html, onAttr) {
    "use strict";

    var lastPos = 0; // start of the name or value currently being read
    var retAttrs = [];
    var tmpName = false; // pending attribute name, or false when none
    var len = html.length;
    var normalized = false; // whether whitespace was already turned into " "

    function addAttr(name, value) {
      name = _.trim(name);
      name = name.replace(REGEXP_ILLEGAL_ATTR_NAME, "").toLowerCase();
      if (name.length < 1) return;
      var ret = onAttr(name, value || "");
      if (ret) retAttrs.push(ret);
    }

    // analyze the input character by character
    for (var i = 0; i < len; i++) {
      var c = html.charAt(i);
      var v, j;
      if (tmpName === false && c === "=") {
        tmpName = html.slice(lastPos, i);
        lastPos = i + 1;
        continue;
      }
      if (tmpName !== false) {
        if (
          i === lastPos &&
          (c === '"' || c === "'") &&
          html.charAt(i - 1) === "="
        ) {
          // Quoted value: jump straight to the matching quote.
          j = html.indexOf(c, i + 1);
          if (j === -1) {
            break;
          } else {
            v = _.trim(html.slice(lastPos + 1, j));
            addAttr(tmpName, v);
            tmpName = false;
            i = j;
            lastPos = i + 1;
            continue;
          }
        }
      }
      if (/\s|\n|\t/.test(c)) {
        // Turn every whitespace character into a plain space so the helper
        // scans only have to compare against " ". The replacement keeps the
        // string length and is a no-op when repeated, so it runs only once.
        if (!normalized) {
          html = html.replace(/\s|\n|\t/g, " ");
          normalized = true;
        }
        if (tmpName === false) {
          j = findNextEqual(html, i);
          if (j === -1) {
            // Name without a value.
            v = _.trim(html.slice(lastPos, i));
            addAttr(v);
            lastPos = i + 1;
            continue;
          } else {
            // Spaces before an "=": resume at the "=" itself.
            i = j - 1;
            continue;
          }
        } else {
          j = findBeforeEqual(html, i - 1);
          if (j === -1) {
            // Whitespace ends an unquoted value.
            v = _.trim(html.slice(lastPos, i));
            v = stripQuoteWrap(v);
            addAttr(tmpName, v);
            tmpName = false;
            lastPos = i + 1;
            continue;
          } else {
            // Spaces right after the "=": the value has not started yet.
            continue;
          }
        }
      }
    }

    // Flush whatever is still pending at the end of the input.
    if (lastPos < html.length) {
      if (tmpName === false) {
        addAttr(html.slice(lastPos));
      } else {
        addAttr(tmpName, stripQuoteWrap(_.trim(html.slice(lastPos))));
      }
    }

    return _.trim(retAttrs.join(" "));
  }
  /**
   * Starting at index `i`, skip spaces forwards and look for an "=".
   *
   * @param {String} str text to scan
   * @param {Number} i index to start at
   * @return {Number} index of the "=" when it is the first non-space
   *   character at or after `i`; -1 when another character comes first or the
   *   end of the string is reached
   */
  function findNextEqual(str, i) {
    for (; i < str.length; i++) {
      var c = str[i];
      if (c === " ") continue;
      if (c === "=") return i;
      return -1;
    }
    // Only spaces were left: report "not found" explicitly instead of
    // falling off the end and returning undefined.
    return -1;
  }
  /**
   * Starting at index `i`, skip spaces backwards and look for an "=".
   *
   * @param {String} str text to scan
   * @param {Number} i index to start at
   * @return {Number} index of the "=" when it is the first non-space
   *   character at or before `i`; -1 when another character comes first or
   *   the start of the string is passed
   */
  function findBeforeEqual(str, i) {
    // Scan down to index 0 inclusive so an "=" in the first position is seen.
    for (; i >= 0; i--) {
      var c = str[i];
      if (c === " ") continue;
      if (c === "=") return i;
      return -1;
    }
    // Only spaces were found: report "not found" explicitly instead of
    // falling off the end and returning undefined.
    return -1;
  }
  /**
   * Tell whether `text` starts and ends with the same quote character.
   *
   * Both double and single quotes count. The first and last character are
   * compared by index, so a one-character string that is just a quote also
   * counts as wrapped.
   *
   * @param {String} text
   * @return {Boolean}
   */
  function isQuoteWrapString(text) {
    var first = text[0];
    var last = text[text.length - 1];
    return (first === '"' && last === '"') || (first === "'" && last === "'");
  }
  /**
   * Remove one pair of wrapping quotes from `text`, if there is one.
   *
   * @param {String} text
   * @return {String} `text` without its first and last character when
   *   `isQuoteWrapString(text)` is true, otherwise `text` unchanged
   */
  function stripQuoteWrap(text) {
    if (isQuoteWrapString(text)) {
      // slice(1, -1) drops the first and last character; for a lone quote
      // character it yields the empty string.
      return text.slice(1, -1);
    } else {
      return text;
    }
  }
  // Public API of this module.
  exports.parseTag = parseTag;
  exports.parseAttr = parseAttr;