diff --git a/CHANGELOG.md b/CHANGELOG.md index a45dc30..4ec9adc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,7 +22,7 @@ ### 1.0.0 / 16th May 2021 * Major refactoring of the OLE code to use promises internally - * Added support for Open Office XML-based (.docx) Word files. See #1 + * Added support for Office Open XML-based (.docx) Word files. See #1 * Added support for reading direct from a Buffer. See #11 * Removed event-stream dependency. See #19 * Fixed an issue with not closing files properly. See #23 diff --git a/README.md b/README.md index e16b7bc..453565c 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ This means you do not need to install Word, Office, or anything else, and the module will work on all platforms, without any native binary code requirements. As of version 1.0, this module supports both traditional, OLE-based, Word files (usually .doc), -and modern, Open Office-style, ECMA-376 Word files (usually .docx). It can be +and modern, Office Open XML-style, ECMA-376 Word files (usually .docx). It can be used both with files and with file contents in a Node.js Buffer. ### How do I install this module? diff --git a/__tests__/06_openoffice_files_extract_test.js b/__tests__/06_openoffice_files_extract_test.js index acea553..a13b1bd 100644 --- a/__tests__/06_openoffice_files_extract_test.js +++ b/__tests__/06_openoffice_files_extract_test.js @@ -1,9 +1,9 @@ /** * @overview - * Snapshot tests for all OpenOffice (.docx) files. The useful thing about - * this is it detects changes, but also the snapshots include the binary - * values and characters, so we see exactly what is returned, which is - * extremely useful for debugging. + * Snapshot tests for all Office Open XML (.docx) files. The useful thing about + * this is it detects changes, but also the snapshots include the binary values + * and characters, so we see exactly what is returned, which is extremely + * useful for debugging. */ const fs = require('fs'); diff --git a/__tests__/07_openoffice_buffers_extract_test.js b/__tests__/07_openoffice_buffers_extract_test.js index 410fe08..b57a885 100644 --- a/__tests__/07_openoffice_buffers_extract_test.js +++ b/__tests__/07_openoffice_buffers_extract_test.js @@ -1,9 +1,9 @@ /** * @overview - * Snapshot tests for all OpenOffice (.docx) files. The useful thing about - * this is it detects changes, but also the snapshots include the binary - * values and characters, so we see exactly what is returned, which is - * extremely useful for debugging. + * Snapshot tests for all Office Open XML (.docx) files. The useful thing about + * this is it detects changes, but also the snapshots include the binary values + * and characters, so we see exactly what is returned, which is extremely + * useful for debugging. */ const fs = require('fs'); diff --git a/lib/open-office-extractor.js b/lib/open-office-extractor.js index 5e5bb72..93ae6bb 100644 --- a/lib/open-office-extractor.js +++ b/lib/open-office-extractor.js @@ -3,13 +3,13 @@ * @module open-office-extractor * * @description - * Implements the main Open Office format extractor. Open Office .docx files + * Implements the main Office Open XML format extractor. Office Open XML .docx files * are essentially zip files containing streams, and each of these streams contains * XML content in one form or another. So we need to use {@link zlib} to extract * the streams, and something like `sax-js` to parse the XML that we find * there. * - * We probably don't need the whole of the Open Office data, we're only likely + * We probably don't need the whole of the Office Open XML data, we're only likely * to need a few streams. Sadly, the documentation for the file format is literally * 5000 pages. * Note that [WordOleExtractor]{@link module:word-ole-extractor~WordOleExtractor} is @@ -39,7 +39,7 @@ function each(callback, array, index) { /** * @class - * The main class implementing extraction from Open Office Word files. + * The main class implementing extraction from Office Open XML Word files. */ class OpenOfficeExtractor { @@ -121,7 +121,7 @@ class OpenOfficeExtractor { // Re-order, so the content types are always loaded first const index = entryNames.indexOf('[Content_Types].xml'); if (index === -1) { - throw new Error("Invalid Open Office XML: missing content types"); + throw new Error("Invalid Office Open XML: missing content types"); } entryNames.splice(index, 1); @@ -315,6 +315,7 @@ class OpenOfficeExtractor { if (err) { return reject(err); } + readStream.setEncoding('UTF-8'); this._source = entry.fileName; const parser = this.createXmlParser(); @@ -327,12 +328,18 @@ class OpenOfficeExtractor { readStream.on("error", (e) => reject(e)); readStream.on("readable", () => { // eslint-disable-next-line no-constant-condition + + // console.log(`XMLDecl:\n${JSON.stringify(parser.xmlDecl)}`); // undefined // {"version":"1.0","encoding":"UTF-8","standalone":"yes"} + if (parser.xmlDecl['encoding'] !== undefined) { + readStream.setEncoding(parser.xmlDecl['encoding']); + } + while (true) { const chunk = readStream.read(0x1000); if (chunk === null) { return; } - + parser.write(chunk); } }); diff --git a/lib/word-ole-extractor.js b/lib/word-ole-extractor.js index 35381bb..e0d87f7 100644 --- a/lib/word-ole-extractor.js +++ b/lib/word-ole-extractor.js @@ -6,7 +6,7 @@ * Depends on [OleCompoundDoc]{@link module:ole-compound-doc~OleCompoundDoc} * for most of the underlying OLE logic. Note that * [OpenOfficeExtractor]{@link module:open-office-extractor~OpenOfficeExtractor} is - * used for newer, Open Office-style, files. + * used for newer, Office Open XML-style, files. */ const OleCompoundDoc = require('./ole-compound-doc');