workbook-reader.js 9.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339
  1. const fs = require('fs');
  2. const {EventEmitter} = require('events');
  3. const {PassThrough, Readable} = require('readable-stream');
  4. const nodeStream = require('stream');
  5. const unzip = require('unzipper');
  6. const tmp = require('tmp');
  7. const iterateStream = require('../../utils/iterate-stream');
  8. const parseSax = require('../../utils/parse-sax');
  9. const StyleManager = require('../../xlsx/xform/style/styles-xform');
  10. const WorkbookXform = require('../../xlsx/xform/book/workbook-xform');
  11. const RelationshipsXform = require('../../xlsx/xform/core/relationships-xform');
  12. const WorksheetReader = require('./worksheet-reader');
  13. const HyperlinkReader = require('./hyperlink-reader');
  14. tmp.setGracefulCleanup();
  15. class WorkbookReader extends EventEmitter {
  16. constructor(input, options = {}) {
  17. super();
  18. this.input = input;
  19. this.options = {
  20. worksheets: 'emit',
  21. sharedStrings: 'cache',
  22. hyperlinks: 'ignore',
  23. styles: 'ignore',
  24. entries: 'ignore',
  25. ...options,
  26. };
  27. this.styles = new StyleManager();
  28. this.styles.init();
  29. }
  30. _getStream(input) {
  31. if (input instanceof nodeStream.Readable || input instanceof Readable) {
  32. return input;
  33. }
  34. if (typeof input === 'string') {
  35. return fs.createReadStream(input);
  36. }
  37. throw new Error(`Could not recognise input: ${input}`);
  38. }
  39. async read(input, options) {
  40. try {
  41. for await (const {eventType, value} of this.parse(input, options)) {
  42. switch (eventType) {
  43. case 'shared-strings':
  44. this.emit(eventType, value);
  45. break;
  46. case 'worksheet':
  47. this.emit(eventType, value);
  48. await value.read();
  49. break;
  50. case 'hyperlinks':
  51. this.emit(eventType, value);
  52. break;
  53. }
  54. }
  55. this.emit('end');
  56. this.emit('finished');
  57. } catch (error) {
  58. this.emit('error', error);
  59. }
  60. }
  61. async *[Symbol.asyncIterator]() {
  62. for await (const {eventType, value} of this.parse()) {
  63. if (eventType === 'worksheet') {
  64. yield value;
  65. }
  66. }
  67. }
  68. async *parse(input, options) {
  69. if (options) this.options = options;
  70. const stream = (this.stream = this._getStream(input || this.input));
  71. const zip = unzip.Parse({forceStream: true});
  72. stream.pipe(zip);
  73. // worksheets, deferred for parsing after shared strings reading
  74. const waitingWorkSheets = [];
  75. for await (const entry of iterateStream(zip)) {
  76. let match;
  77. let sheetNo;
  78. switch (entry.path) {
  79. case '_rels/.rels':
  80. break;
  81. case 'xl/_rels/workbook.xml.rels':
  82. await this._parseRels(entry);
  83. break;
  84. case 'xl/workbook.xml':
  85. await this._parseWorkbook(entry);
  86. break;
  87. case 'xl/sharedStrings.xml':
  88. yield* this._parseSharedStrings(entry);
  89. break;
  90. case 'xl/styles.xml':
  91. await this._parseStyles(entry);
  92. break;
  93. default:
  94. if (entry.path.match(/xl\/worksheets\/sheet\d+[.]xml/)) {
  95. match = entry.path.match(/xl\/worksheets\/sheet(\d+)[.]xml/);
  96. sheetNo = match[1];
  97. if (this.sharedStrings && this.workbookRels) {
  98. yield* this._parseWorksheet(iterateStream(entry), sheetNo);
  99. } else {
  100. // create temp file for each worksheet
  101. await new Promise((resolve, reject) => {
  102. tmp.file((err, path, fd, tempFileCleanupCallback) => {
  103. if (err) {
  104. return reject(err);
  105. }
  106. waitingWorkSheets.push({sheetNo, path, tempFileCleanupCallback});
  107. const tempStream = fs.createWriteStream(path);
  108. entry.pipe(tempStream);
  109. return tempStream.on('finish', () => {
  110. return resolve();
  111. });
  112. });
  113. });
  114. }
  115. } else if (entry.path.match(/xl\/worksheets\/_rels\/sheet\d+[.]xml.rels/)) {
  116. match = entry.path.match(/xl\/worksheets\/_rels\/sheet(\d+)[.]xml.rels/);
  117. sheetNo = match[1];
  118. yield* this._parseHyperlinks(iterateStream(entry), sheetNo);
  119. }
  120. break;
  121. }
  122. entry.autodrain();
  123. }
  124. for (const {sheetNo, path, tempFileCleanupCallback} of waitingWorkSheets) {
  125. let fileStream = fs.createReadStream(path);
  126. // TODO: Remove once node v8 is deprecated
  127. // Detect and upgrade old fileStreams
  128. if (!fileStream[Symbol.asyncIterator]) {
  129. fileStream = fileStream.pipe(new PassThrough());
  130. }
  131. yield* this._parseWorksheet(fileStream, sheetNo);
  132. tempFileCleanupCallback();
  133. }
  134. }
  135. _emitEntry(payload) {
  136. if (this.options.entries === 'emit') {
  137. this.emit('entry', payload);
  138. }
  139. }
  140. async _parseRels(entry) {
  141. const xform = new RelationshipsXform();
  142. this.workbookRels = await xform.parseStream(iterateStream(entry));
  143. }
  144. async _parseWorkbook(entry) {
  145. this._emitEntry({type: 'workbook'});
  146. const workbook = new WorkbookXform();
  147. await workbook.parseStream(iterateStream(entry));
  148. this.properties = workbook.map.workbookPr;
  149. this.model = workbook.model;
  150. }
  151. async *_parseSharedStrings(entry) {
  152. this._emitEntry({type: 'shared-strings'});
  153. switch (this.options.sharedStrings) {
  154. case 'cache':
  155. this.sharedStrings = [];
  156. break;
  157. case 'emit':
  158. break;
  159. default:
  160. return;
  161. }
  162. let text = null;
  163. let richText = [];
  164. let index = 0;
  165. let font = null;
  166. for await (const events of parseSax(iterateStream(entry))) {
  167. for (const {eventType, value} of events) {
  168. if (eventType === 'opentag') {
  169. const node = value;
  170. switch (node.name) {
  171. case 'b':
  172. font = font || {};
  173. font.bold = true;
  174. break;
  175. case 'charset':
  176. font = font || {};
  177. font.charset = parseInt(node.attributes.charset, 10);
  178. break;
  179. case 'color':
  180. font = font || {};
  181. font.color = {};
  182. if (node.attributes.rgb) {
  183. font.color.argb = node.attributes.argb;
  184. }
  185. if (node.attributes.val) {
  186. font.color.argb = node.attributes.val;
  187. }
  188. if (node.attributes.theme) {
  189. font.color.theme = node.attributes.theme;
  190. }
  191. break;
  192. case 'family':
  193. font = font || {};
  194. font.family = parseInt(node.attributes.val, 10);
  195. break;
  196. case 'i':
  197. font = font || {};
  198. font.italic = true;
  199. break;
  200. case 'outline':
  201. font = font || {};
  202. font.outline = true;
  203. break;
  204. case 'rFont':
  205. font = font || {};
  206. font.name = node.value;
  207. break;
  208. case 'si':
  209. font = null;
  210. richText = [];
  211. text = null;
  212. break;
  213. case 'sz':
  214. font = font || {};
  215. font.size = parseInt(node.attributes.val, 10);
  216. break;
  217. case 'strike':
  218. break;
  219. case 't':
  220. text = null;
  221. break;
  222. case 'u':
  223. font = font || {};
  224. font.underline = true;
  225. break;
  226. case 'vertAlign':
  227. font = font || {};
  228. font.vertAlign = node.attributes.val;
  229. break;
  230. }
  231. } else if (eventType === 'text') {
  232. text = text ? text + value : value;
  233. } else if (eventType === 'closetag') {
  234. const node = value;
  235. switch (node.name) {
  236. case 'r':
  237. richText.push({
  238. font,
  239. text,
  240. });
  241. font = null;
  242. text = null;
  243. break;
  244. case 'si':
  245. if (this.options.sharedStrings === 'cache') {
  246. this.sharedStrings.push(richText.length ? {richText} : text);
  247. } else if (this.options.sharedStrings === 'emit') {
  248. yield {index: index++, text: richText.length ? {richText} : text};
  249. }
  250. richText = [];
  251. font = null;
  252. text = null;
  253. break;
  254. }
  255. }
  256. }
  257. }
  258. }
  259. async _parseStyles(entry) {
  260. this._emitEntry({type: 'styles'});
  261. if (this.options.styles === 'cache') {
  262. this.styles = new StyleManager();
  263. await this.styles.parseStream(iterateStream(entry));
  264. }
  265. }
  266. *_parseWorksheet(iterator, sheetNo) {
  267. this._emitEntry({type: 'worksheet', id: sheetNo});
  268. const worksheetReader = new WorksheetReader({
  269. workbook: this,
  270. id: sheetNo,
  271. iterator,
  272. options: this.options,
  273. });
  274. const matchingRel = (this.workbookRels || []).find(
  275. rel => rel.Target === `worksheets/sheet${sheetNo}.xml`
  276. );
  277. const matchingSheet =
  278. matchingRel && (this.model.sheets || []).find(sheet => sheet.rId === matchingRel.Id);
  279. if (matchingSheet) {
  280. worksheetReader.id = matchingSheet.id;
  281. worksheetReader.name = matchingSheet.name;
  282. worksheetReader.state = matchingSheet.state;
  283. }
  284. if (this.options.worksheets === 'emit') {
  285. yield {eventType: 'worksheet', value: worksheetReader};
  286. }
  287. }
  288. *_parseHyperlinks(iterator, sheetNo) {
  289. this._emitEntry({type: 'hyperlinks', id: sheetNo});
  290. const hyperlinksReader = new HyperlinkReader({
  291. workbook: this,
  292. id: sheetNo,
  293. iterator,
  294. options: this.options,
  295. });
  296. if (this.options.hyperlinks === 'emit') {
  297. yield {eventType: 'hyperlinks', value: hyperlinksReader};
  298. }
  299. }
  300. }
// For reference only - the values each reader option is listed as accepting.
// Nothing in the code visible here validates `options` against this table;
// the defaults actually applied are set in the WorkbookReader constructor.
WorkbookReader.Options = {
  worksheets: ['emit', 'ignore'],
  sharedStrings: ['cache', 'emit', 'ignore'],
  hyperlinks: ['cache', 'emit', 'ignore'],
  styles: ['cache', 'ignore'],
  entries: ['emit', 'ignore'],
};

// The class is the module's sole export.
module.exports = WorkbookReader;