// tokenizer.js
  1. module.exports = tokenize
  2. var through = require('through')
  // Token types and tokenizer states. The tokenizer stamps each `{type, data}`
  // token with whichever of these is the current state at the time it is
  // queued, so states and token types share one set of names.

  // Idle state: the next character decides which state to enter.
  var READY = '(ready)'

  // Simple selectors gathered character by character.
  var TAG = 'tag'
  var ID = 'id'
  var CLASS = 'class'

  // Single-character tokens queued immediately from the idle state.
  var STAR = '*'
  var SUBJECT = '!'
  var COMMA = 'comma'

  // Combinators: whitespace and the `>`, `+`, `~` operators.
  var ANY_CHILD = 'any-child'
  var OPERATION = 'op'

  // Pseudo selectors; PSEUDOSTART is the state used inside `( ... )`.
  var PSEUDOCLASS = ':'
  var PSEUDOPSEUDO = '::'
  var PSEUDOSTART = 'pseudo-start'

  // Attribute selectors: name, comparison operator, value, and the token type.
  var ATTR_START = 'attr-start'
  var ATTR_COMP = 'attr-comp'
  var ATTR_END = 'attr-end'
  var ATTR = 'attr'
  /**
   * Create a streaming tokenizer for CSS-style selector text.
   *
   * The returned `through` stream accepts selector text in arbitrary chunks and
   * queues one token per selector part. Most tokens are `{type, data}` with a
   * string `data`; an attribute selector with a comparison is queued as
   * `{type: ATTR, data: {lhs, rhs, cmp}}`. When the input ends, any partially
   * gathered token is flushed and the stream is ended with `queue(null)`.
   *
   * @returns {Object} the stream created by `through(ondata, onend)`
   */
  function tokenize() {
    var escaped = false   // previous character was an unconsumed backslash
      , gathered = []     // characters collected for the token in progress
      , state = READY     // current state; doubles as the queued token type
      , data = []         // characters of the chunk being processed
      , idx = 0           // read cursor into `data`
      , stream
      , length
      , quote             // active quote character inside [attr="..."] / :x("...")
      , depth             // parenthesis nesting depth inside a pseudo argument
      , lhs
      , rhs
      , cmp
      , c                 // character currently being processed

    return stream = through(ondata, onend)

    /**
     * Feed one chunk through the state machine, one character at a time.
     * State handlers may `--idx` to have the same character re-read by the
     * state they just switched to.
     */
    function ondata(chunk) {
      // String() lets Buffers and other non-string chunks be tokenized too.
      data = data.concat(String(chunk).split(''))
      length = data.length

      while(idx < length && (c = data[idx++])) {
        switch(state) {
          case READY: state_ready(); break
          case ANY_CHILD: state_any_child(); break
          case OPERATION: state_op(); break
          case ATTR_START: state_attr_start(); break
          case ATTR_COMP: state_attr_compare(); break
          case ATTR_END: state_attr_end(); break
          case PSEUDOCLASS:
          case PSEUDOPSEUDO: state_pseudo(); break
          case PSEUDOSTART: state_pseudostart(); break
          case ID:
          case TAG:
          case CLASS: state_gather(); break
        }
      }

      // Drop the consumed characters AND rewind the cursor. Without the
      // rewind, the next chunk would be read from a stale offset and its
      // leading characters would be skipped.
      data = data.slice(idx)
      idx = 0
    }

    /**
     * Flush the token in progress (if any) and signal end-of-stream.
     */
    function onend(chunk) {
      if(arguments.length) {
        ondata(chunk)
      }

      if(gathered.length) {
        stream.queue(token())
      }

      // A custom `through` end handler must queue null itself, otherwise
      // consumers never see the stream end.
      stream.queue(null)
    }

    /**
     * Idle state: pick the next state from the current character.
     * Characters matching none of the cases are skipped.
     */
    function state_ready() {
      switch(true) {
        case '#' === c: state = ID; break
        case '.' === c: state = CLASS; break
        case ':' === c: state = PSEUDOCLASS; break
        case '[' === c: state = ATTR_START; break
        case '!' === c: subject(); break
        case '*' === c: star(); break
        case ',' === c: comma(); break
        // Re-read the combinator in state_op so it ends up in the token data;
        // otherwise `a>b` would produce an operator token with empty data.
        case /[>\+~]/.test(c): state = OPERATION; --idx; break
        case /\s/.test(c): state = ANY_CHILD; break
        case /[\w\d\-_]/.test(c): state = TAG; --idx; break
      }
    }

    /** Queue a `!` (subject) token immediately. */
    function subject() {
      state = SUBJECT
      gathered = ['!']
      stream.queue(token())
      state = READY
    }

    /** Queue a `*` (universal) token immediately. */
    function star() {
      state = STAR
      gathered = ['*']
      stream.queue(token())
      state = READY
    }

    /** Queue a comma token immediately. */
    function comma() {
      state = COMMA
      gathered = [',']
      stream.queue(token())
      state = READY
    }

    /**
     * Gather combinator characters, skip whitespace after them, and queue the
     * operator token when the next selector part begins.
     */
    function state_op() {
      if(/[>\+~]/.test(c)) {
        return gathered.push(c)
      }

      // chomp down the following whitespace.
      if(/\s/.test(c)) {
        return
      }

      stream.queue(token())
      state = READY
      --idx
    }

    /**
     * Whitespace run: either it precedes an explicit combinator (hand over to
     * state_op) or it is itself a combinator, queued with empty data.
     */
    function state_any_child() {
      if(/\s/.test(c)) {
        return
      }

      if(/[>\+~]/.test(c)) {
        --idx
        state = OPERATION
        return
      }

      stream.queue(token())
      state = READY
      --idx
    }

    /**
     * Gather a pseudo name. A `(` switches to argument parsing; any other
     * terminator queues the name as a `:` or `::` token.
     */
    function state_pseudo() {
      // A second colon directly after the first marks a `::` pseudo.
      if(c === ':' && state === PSEUDOCLASS && !gathered.length && !escaped) {
        state = PSEUDOPSEUDO
        return
      }

      // Remember which pseudo kind this is; state_gather overwrites `state`.
      rhs = state
      state_gather(true)

      if(state !== READY) {
        return
      }

      if(c === '(') {
        lhs = gathered.join('')
        state = PSEUDOSTART
        gathered.length = 0
        depth = 1
        // state_gather rewound onto the `(`; step past it again.
        ++idx
        return
      }

      // Queue under the remembered kind so `::name` is not reported as `:`.
      state = rhs
      stream.queue(token())
      state = READY
    }

    /**
     * Gather a pseudo argument up to its matching `)`. A quote that opens the
     * argument is stripped, and backslash escapes inside it are resolved.
     */
    function state_pseudostart() {
      if(gathered.length === 0 && !quote) {
        quote = /['"]/.test(c) ? c : null

        if(quote) {
          return
        }
      }

      if(quote) {
        if(!escaped && c === quote) {
          quote = null
          return
        }

        if(c === '\\') {
          // An escaped backslash is a literal and must also clear the flag,
          // otherwise a closing quote right after `\\` would be swallowed.
          if(escaped) {
            gathered.push(c)
          }
          escaped = !escaped
          return
        }

        escaped = false
        gathered.push(c)
        return
      }

      gathered.push(c)

      if(c === '(') {
        ++depth
      } else if(c === ')') {
        --depth
      }

      if(!depth) {
        // Discard the closing parenthesis that was just gathered.
        gathered.pop()
        stream.queue({
            type: rhs
          , data: lhs + '(' + gathered.join('') + ')'
        })

        state = READY
        lhs = rhs = cmp = null
        gathered.length = 0
      }
    }

    /**
     * Gather an attribute name. `]` queues a bare ATTR token; any other
     * terminator moves on to the comparison operator.
     */
    function state_attr_start() {
      state_gather(true)

      if(state !== READY) {
        return
      }

      if(c === ']') {
        state = ATTR
        stream.queue(token())
        state = READY
        return
      }

      lhs = gathered.join('')
      gathered.length = 0
      state = ATTR_COMP
    }

    /**
     * Gather the comparison operator: finished on `=` or after two operator
     * characters. Other characters are ignored.
     */
    function state_attr_compare() {
      if(/[=~|$^*]/.test(c)) {
        gathered.push(c)
      }

      if(gathered.length === 2 || c === '=') {
        cmp = gathered.join('')
        gathered.length = 0
        state = ATTR_END
        quote = null
      }
    }

    /**
     * Gather an attribute value, quoted or bare, and queue the full attribute
     * token once the value is terminated.
     */
    function state_attr_end() {
      if(!gathered.length && !quote) {
        quote = /['"]/.test(c) ? c : null

        if(quote) {
          return
        }
      }

      if(quote) {
        if(!escaped && c === quote) {
          quote = null
          return
        }

        if(c === '\\') {
          if(escaped) {
            gathered.push(c)
          }
          escaped = !escaped
          return
        }

        escaped = false
        gathered.push(c)
        return
      }

      state_gather(true)

      if(state !== READY) {
        return
      }

      stream.queue({
          type: ATTR
        , data: {
              lhs: lhs
            , rhs: gathered.join('')
            , cmp: cmp
          }
      })

      state = READY
      lhs = rhs = cmp = null
      gathered.length = 0
    }

    /**
     * Collect identifier characters (word characters, digits, `-`, `_`, or
     * any backslash-escaped character). On a terminator, switch to READY and
     * rewind so the terminator is re-read.
     *
     * @param {boolean} [quietly] when truthy, do not queue a token on the
     *   terminator; the caller inspects `gathered` itself.
     */
    function state_gather(quietly) {
      if(/[^\d\w\-_]/.test(c) && !escaped) {
        if(c === '\\') {
          escaped = true
        } else {
          !quietly && stream.queue(token())
          state = READY
          --idx
        }

        return
      }

      escaped = false
      gathered.push(c)
    }

    /**
     * Build a token from the gathered characters, typed by the current
     * state, and reset the gather buffer.
     */
    function token() {
      var text = gathered.join('')

      gathered.length = 0

      return {
          type: state
        , data: text
      }
    }
  }