morungos · Oliverity · Jan 12, 2024 · Jan 12, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -22,7 +22,7 @@
 ### 1.0.0 / 16th May 2021
 
  * Major refactoring of the OLE code to use promises internally
- * Added support for Open Office XML-based (.docx) Word files. See #1
+ * Added support for Office Open XML-based (.docx) Word files. See #1
  * Added support for reading direct from a Buffer. See #11
  * Removed event-stream dependency. See #19
  * Fixed an issue with not closing files properly. See #23

diff --git a/README.md b/README.md
@@ -18,7 +18,7 @@ This means you do not need to install Word, Office, or anything else, and the
 module will work on all platforms, without any native binary code requirements.
 
 As of version 1.0, this module supports both traditional, OLE-based, Word files (usually .doc), 
-and modern, Open Office-style, ECMA-376 Word files (usually .docx). It can be 
+and modern, Office Open XML-style, ECMA-376 Word files (usually .docx). It can be 
 used both with files and with file contents in a Node.js Buffer.
 
 ### How do I install this module?

diff --git a/__tests__/06_openoffice_files_extract_test.js b/__tests__/06_openoffice_files_extract_test.js
@@ -1,9 +1,9 @@
 /**
  * @overview
- * Snapshot tests for all OpenOffice (.docx) files. The useful thing about
- * this is it detects changes, but also the snapshots include the binary
- * values and characters, so we see exactly what is returned, which is
- * extremely useful for debugging.
+ * Snapshot tests for all Office Open XML (.docx) files. Snapshot testing is
+ * useful because it detects changes, and because the snapshots include the
+ * binary values and characters, so we see exactly what is returned, which
+ * is extremely useful for debugging.
  */
 
 const fs = require('fs');

diff --git a/__tests__/07_openoffice_buffers_extract_test.js b/__tests__/07_openoffice_buffers_extract_test.js
@@ -1,9 +1,9 @@
 /**
  * @overview
- * Snapshot tests for all OpenOffice (.docx) files. The useful thing about
- * this is it detects changes, but also the snapshots include the binary
- * values and characters, so we see exactly what is returned, which is
- * extremely useful for debugging.
+ * Snapshot tests for all Office Open XML (.docx) files. Snapshot testing is
+ * useful because it detects changes, and because the snapshots include the
+ * binary values and characters, so we see exactly what is returned, which
+ * is extremely useful for debugging.
  */
 
 const fs = require('fs');

diff --git a/lib/open-office-extractor.js b/lib/open-office-extractor.js
@@ -3,13 +3,13 @@
  * @module open-office-extractor
  * 
  * @description
- * Implements the main Open Office format extractor. Open Office .docx files
+ * Implements the main Office Open XML format extractor. Office Open XML .docx files
  * are essentially zip files containing streams, and each of these streams contains
  * XML content in one form or another. So we need to use {@link zlib} to extract
  * the streams, and something like `sax-js` to parse the XML that we find 
  * there. 
  * 
- * We probably don't need the whole of the Open Office data, we're only likely
+ * We probably don't need the whole of the Office Open XML data; we're only likely
  * to need a few streams. Sadly, the documentation for the file format is literally
  * 5000 pages.
  * Note that [WordOleExtractor]{@link module:word-ole-extractor~WordOleExtractor} is 
@@ -39,7 +39,7 @@ function each(callback, array, index) {
 
 /**
  * @class
- * The main class implementing extraction from Open Office Word files.
+ * The main class implementing extraction from Office Open XML Word files.
  */
 class OpenOfficeExtractor {
 
@@ -121,7 +121,7 @@ class OpenOfficeExtractor {
         // Re-order, so the content types are always loaded first
         const index = entryNames.indexOf('[Content_Types].xml');
         if (index === -1) {
-          throw new Error("Invalid Open Office XML: missing content types");
+          throw new Error("Invalid Office Open XML: missing content types");
         }
 
         entryNames.splice(index, 1);
@@ -315,6 +315,7 @@ class OpenOfficeExtractor {
         if (err) {
           return reject(err);
         }
+        readStream.setEncoding('UTF-8');
 
         this._source = entry.fileName;
         const parser = this.createXmlParser();
@@ -327,12 +328,18 @@ class OpenOfficeExtractor {
         readStream.on("error", (e) => reject(e));
         readStream.on("readable", () => {
           // eslint-disable-next-line no-constant-condition
+
+          // NOTE(review): xmlDecl presumably stays unset until the XML declaration is parsed; afterwards it looks like {"version":"1.0","encoding":"UTF-8","standalone":"yes"} — confirm.
+          if (parser.xmlDecl['encoding'] !== undefined) {
+            readStream.setEncoding(parser.xmlDecl['encoding']);
+          }
+          // eslint-disable-next-line no-constant-condition
           while (true) {
             const chunk = readStream.read(0x1000);
             if (chunk === null) {
               return;
             }
-      
+
             parser.write(chunk);
           }
         });

diff --git a/lib/word-ole-extractor.js b/lib/word-ole-extractor.js
@@ -6,7 +6,7 @@
  * Depends on [OleCompoundDoc]{@link module:ole-compound-doc~OleCompoundDoc} 
  * for most of the underlying OLE logic. Note that
  * [OpenOfficeExtractor]{@link module:open-office-extractor~OpenOfficeExtractor} is 
- * used for newer, Open Office-style, files. 
+ * used for newer, Office Open XML-style, files. 
  */
 
 const OleCompoundDoc = require('./ole-compound-doc');