From 1ccf1003b46993c1c30f7e16fc7b0fa2e7dd3c2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=98=D0=B2=D0=B0=D0=BD=D0=BE=D0=B2=20=D0=9E=D0=BB=D0=B5?= =?UTF-8?q?=D0=B3?= Date: Fri, 12 Jan 2024 16:05:01 +0300 Subject: [PATCH 1/2] https://github.com/morungos/node-word-extractor/issues/54 Tries to prevent multi-byte characters from breaking. Unfortunately, we need to setEncoding() before actually reading contents, to avoid such breaking. Which means, we won't know the encoding yet. Right now UTF-8 is assumed, but OOXML files might be UTF-16 too. Haven't tested those as yet. --- lib/open-office-extractor.js | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/lib/open-office-extractor.js b/lib/open-office-extractor.js index 5e5bb72..d4a3691 100644 --- a/lib/open-office-extractor.js +++ b/lib/open-office-extractor.js @@ -315,6 +315,7 @@ class OpenOfficeExtractor { if (err) { return reject(err); } + readStream.setEncoding('UTF-8'); this._source = entry.fileName; const parser = this.createXmlParser(); @@ -327,12 +328,18 @@ class OpenOfficeExtractor { readStream.on("error", (e) => reject(e)); readStream.on("readable", () => { // eslint-disable-next-line no-constant-condition + + // console.log(`XMLDecl:\n${JSON.stringify(parser.xmlDecl)}`); // undefined // {"version":"1.0","encoding":"UTF-8","standalone":"yes"} + if (parser.xmlDecl['encoding'] !== undefined) { + readStream.setEncoding(parser.xmlDecl['encoding']); + } + while (true) { const chunk = readStream.read(0x1000); if (chunk === null) { return; } - + parser.write(chunk); } }); From 66d711f5901605631509f988faff104cfbe20941 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=98=D0=B2=D0=B0=D0=BD=D0=BE=D0=B2=20=D0=9E=D0=BB=D0=B5?= =?UTF-8?q?=D0=B3?= Date: Fri, 12 Jan 2024 16:09:20 +0300 Subject: [PATCH 2/2] OOXML stands for "Office Open XML", not "Open Office XML". 
--- CHANGELOG.md | 2 +- README.md | 2 +- __tests__/06_openoffice_files_extract_test.js | 8 ++++---- __tests__/07_openoffice_buffers_extract_test.js | 8 ++++---- lib/open-office-extractor.js | 8 ++++---- lib/word-ole-extractor.js | 2 +- 6 files changed, 15 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a45dc30..4ec9adc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,7 +22,7 @@ ### 1.0.0 / 16th May 2021 * Major refactoring of the OLE code to use promises internally - * Added support for Open Office XML-based (.docx) Word files. See #1 + * Added support for Office Open XML-based (.docx) Word files. See #1 * Added support for reading direct from a Buffer. See #11 * Removed event-stream dependency. See #19 * Fixed an issue with not closing files properly. See #23 diff --git a/README.md b/README.md index e16b7bc..453565c 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ This means you do not need to install Word, Office, or anything else, and the module will work on all platforms, without any native binary code requirements. As of version 1.0, this module supports both traditional, OLE-based, Word files (usually .doc), -and modern, Open Office-style, ECMA-376 Word files (usually .docx). It can be +and modern, Office Open XML-style, ECMA-376 Word files (usually .docx). It can be used both with files and with file contents in a Node.js Buffer. ### How do I install this module? diff --git a/__tests__/06_openoffice_files_extract_test.js b/__tests__/06_openoffice_files_extract_test.js index acea553..a13b1bd 100644 --- a/__tests__/06_openoffice_files_extract_test.js +++ b/__tests__/06_openoffice_files_extract_test.js @@ -1,9 +1,9 @@ /** * @overview - * Snapshot tests for all OpenOffice (.docx) files. The useful thing about - * this is it detects changes, but also the snapshots include the binary - * values and characters, so we see exactly what is returned, which is - * extremely useful for debugging. 
+ * Snapshot tests for all Office Open XML (.docx) files. The useful thing about + * this is it detects changes, but also the snapshots include the binary values + * and characters, so we see exactly what is returned, which is extremely + * useful for debugging. */ const fs = require('fs'); diff --git a/__tests__/07_openoffice_buffers_extract_test.js b/__tests__/07_openoffice_buffers_extract_test.js index 410fe08..b57a885 100644 --- a/__tests__/07_openoffice_buffers_extract_test.js +++ b/__tests__/07_openoffice_buffers_extract_test.js @@ -1,9 +1,9 @@ /** * @overview - * Snapshot tests for all OpenOffice (.docx) files. The useful thing about - * this is it detects changes, but also the snapshots include the binary - * values and characters, so we see exactly what is returned, which is - * extremely useful for debugging. + * Snapshot tests for all Office Open XML (.docx) files. The useful thing about + * this is it detects changes, but also the snapshots include the binary values + * and characters, so we see exactly what is returned, which is extremely + * useful for debugging. */ const fs = require('fs'); diff --git a/lib/open-office-extractor.js b/lib/open-office-extractor.js index d4a3691..93ae6bb 100644 --- a/lib/open-office-extractor.js +++ b/lib/open-office-extractor.js @@ -3,13 +3,13 @@ * @module open-office-extractor * * @description - * Implements the main Open Office format extractor. Open Office .docx files + * Implements the main Office Open XML format extractor. Office Open XML .docx files * are essentially zip files containing streams, and each of these streams contains * XML content in one form or another. So we need to use {@link zlib} to extract * the streams, and something like `sax-js` to parse the XML that we find * there. * - * We probably don't need the whole of the Open Office data, we're only likely + * We probably don't need the whole of the Office Open XML data, we're only likely * to need a few streams. 
Sadly, the documentation for the file format is literally * 5000 pages. * Note that [WordOleExtractor]{@link module:word-ole-extractor~WordOleExtractor} is @@ -39,7 +39,7 @@ function each(callback, array, index) { /** * @class - * The main class implementing extraction from Open Office Word files. + * The main class implementing extraction from Office Open XML Word files. */ class OpenOfficeExtractor { @@ -121,7 +121,7 @@ class OpenOfficeExtractor { // Re-order, so the content types are always loaded first const index = entryNames.indexOf('[Content_Types].xml'); if (index === -1) { - throw new Error("Invalid Open Office XML: missing content types"); + throw new Error("Invalid Office Open XML: missing content types"); } entryNames.splice(index, 1); diff --git a/lib/word-ole-extractor.js b/lib/word-ole-extractor.js index 35381bb..e0d87f7 100644 --- a/lib/word-ole-extractor.js +++ b/lib/word-ole-extractor.js @@ -6,7 +6,7 @@ * Depends on [OleCompoundDoc]{@link module:ole-compound-doc~OleCompoundDoc} * for most of the underlying OLE logic. Note that * [OpenOfficeExtractor]{@link module:open-office-extractor~OpenOfficeExtractor} is - * used for newer, Open Office-style, files. + * used for newer, Office Open XML-style, files. */ const OleCompoundDoc = require('./ole-compound-doc');