Skip to content

Commit e8c6f9d

Browse files
authored
fix: non-english char encoding, close #454, close #466 (#496)
* fix: non-english char encoding * fix: optional chaining not supported * fix: various tests not finished yet * fix: tests now pass * fix: testing and code complexity * fix: code climate * fix: change comments * fix: simplify code complexity * refactor: simplify logic and add more testing * fix: reduce cognitive complexity * fix: more cognitive complexity * chore: damn space * fix: broke a test * chore: update docs * test: re-enable utf8 test * chore: update request test
1 parent 3bd2555 commit e8c6f9d

File tree

11 files changed

+256
-42
lines changed

11 files changed

+256
-42
lines changed

README.md

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -326,8 +326,12 @@ Parameters - object which includes:
326326

327327
Should return a resolved `Promise` if the resource should be saved, or a `Promise` rejected with an Error if it should be skipped.
328328
Promise should be resolved with:
329-
* `string` which contains response body
330-
* or object with properties `body` (response body, string) and `metadata` - everything you want to save for this resource (like headers, original text, timestamps, etc.), scraper will not use this field at all, it is only for result.
329+
* the `response` object with the `body` modified in place as necessary.
330+
* or object with properties
331+
* `body` (response body, string)
332+
* `encoding` (`binary` or `utf8`) used to save the file, binary used by default.
333+
* `metadata` (object) - everything you want to save for this resource (like headers, original text, timestamps, etc.), scraper will not use this field at all, it is only for result.
334+
* a binary `string`. This is advised against because the binary assumption being made can foul up saving of `utf8` responses to the filesystem.
331335

332336
If multiple `afterResponse` actions are added, the scraper will use the result from the last one.
333337
```javascript
@@ -342,7 +346,8 @@ registerAction('afterResponse', ({response}) => {
342346
metadata: {
343347
headers: response.headers,
344348
someOtherData: [ 1, 2, 3 ]
345-
}
349+
},
350+
encoding: 'utf8'
346351
}
347352
}
348353
});

lib/config/defaults.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ const config = {
4848
],
4949
request: {
5050
throwHttpErrors: false,
51-
encoding: 'binary',
51+
responseType: 'buffer',
5252
//cookieJar: true,
5353
decompress: true,
5454
headers: {

lib/plugins/save-resource-to-fs-plugin.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ class SaveResourceToFileSystemPlugin {
2020
registerAction('saveResource', async ({resource}) => {
2121
const filename = path.join(absoluteDirectoryPath, resource.getFilename());
2222
const text = resource.getText();
23-
await fs.outputFile(filename, text, { encoding: 'binary' });
23+
await fs.outputFile(filename, text, { encoding: resource.getEncoding() });
2424
loadedResources.push(resource);
2525
});
2626

lib/request.js

Lines changed: 68 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,32 +1,78 @@
11
import got from 'got';
22
import logger from './logger.js';
3-
import { extend, isPlainObject } from './utils/index.js';
3+
import { extend } from './utils/index.js';
44

55
/**
 * Extract the bare MIME type from a content-type header value.
 *
 * @param {string} [contentType] - raw header value, e.g. `text/html; charset=utf-8`
 * @returns {string|null} everything before the first `;`, or null when no value is given
 */
function getMimeType (contentType) {
	if (!contentType) {
		return null;
	}

	const [mimeType] = contentType.split(';');
	return mimeType;
}
88

99
function defaultResponseHandler ({response}) {
10-
return Promise.resolve(response.body);
10+
return Promise.resolve(response);
11+
}
12+
13+
/**
 * Derive the encoding used to stringify and save a response from its headers.
 *
 * Charset names are case-insensitive, so `charset=UTF-8` and `charset=utf-8`
 * are treated identically.
 *
 * @param {Object} headers - response headers; only the `content-type` entry is read
 * @returns {string} 'utf8' when the content-type mentions utf-8, otherwise 'binary'
 */
function extractEncodingFromHeader (headers) {
	const contentTypeHeader = headers['content-type'];
	if (!contentTypeHeader) {
		return 'binary';
	}

	// String() also copes with a header supplied as an array of values.
	const normalized = String(contentTypeHeader).toLowerCase();
	return normalized.includes('utf-8') ? 'utf8' : 'binary';
}
18+
19+
/**
 * Work out which encoding applies to a response-handler result.
 *
 * When the value carries a `headers` object the encoding is taken from the
 * headers; otherwise an explicit `encoding` property is used. Anything else
 * (non-objects, objects with neither) falls back to 'binary'.
 *
 * @param {*} response - response object or handler result
 * @returns {string} the encoding to use
 */
function getEncoding (response) {
	const isObject = (value) => Boolean(value) && typeof value === 'object';

	if (!isObject(response)) {
		return 'binary';
	}

	if (isObject(response.headers)) {
		return extractEncodingFromHeader(response.headers);
	}

	return response.encoding ? response.encoding : 'binary';
}
30+
31+
/**
 * Always throws: reports an unusable response-handler result.
 *
 * @param {*} result - the value returned by the response handler
 * @throws {Error} `result` itself when it is already an Error, otherwise a new
 *   Error naming the received type (arrays are reported as 'array')
 */
function throwTypeError (result) {
	// An Error returned by the handler is surfaced unchanged.
	if (result instanceof Error) {
		throw result;
	}

	const type = Array.isArray(result) ? 'array' : typeof result;
	throw new Error(`Wrong response handler result. Expected string or object, but received ${type}`);
}
42+
43+
/**
 * Pick the payload out of a response-handler result.
 *
 * @param {*} result - handler result; either the payload itself or an object with a `body` property
 * @returns {*} `result.body` when `result` is an object that has a `body` key, otherwise `result` unchanged
 */
function getData (result) {
	const hasBody = Boolean(result) && typeof result === 'object' && 'body' in result;

	return hasBody ? result.body : result;
}
1251

1352
function transformResult (result) {
14-
switch (true) {
15-
case typeof result === 'string':
16-
return {
17-
body: result,
18-
metadata: null
19-
};
20-
case isPlainObject(result):
21-
return {
22-
body: result.body,
23-
metadata: result.metadata || null
24-
};
25-
case result === null:
26-
return null;
27-
default:
28-
throw new Error('Wrong response handler result. Expected string or object, but received ' + typeof result);
53+
const encoding = getEncoding(result);
54+
const data = getData(result);
55+
56+
// Check for no data
57+
if (data === null || data === undefined) {
58+
return null;
2959
}
60+
61+
// Then stringify it.
62+
let body = null;
63+
if (data instanceof Buffer) {
64+
body = data.toString(encoding);
65+
} else if (typeof data === 'string') {
66+
body = data;
67+
} else {
68+
throwTypeError(result);
69+
}
70+
71+
return {
72+
body,
73+
encoding,
74+
metadata: result.metadata || data.metadata || null
75+
};
3076
}
3177

3278
async function getRequest ({url, referer, options = {}, afterResponse = defaultResponseHandler}) {
@@ -50,10 +96,13 @@ async function getRequest ({url, referer, options = {}, afterResponse = defaultR
5096
url: response.url,
5197
mimeType: getMimeType(response.headers['content-type']),
5298
body: responseHandlerResult.body,
53-
metadata: responseHandlerResult.metadata
99+
metadata: responseHandlerResult.metadata,
100+
encoding: responseHandlerResult.encoding
54101
};
55102
}
56103

57104
export default {
58-
get: getRequest
105+
get: getRequest,
106+
getEncoding,
107+
transformResult
59108
};

lib/resource.js

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ class Resource {
1212
this.children = [];
1313

1414
this.saved = false;
15+
this.encoding = 'binary';
1516
}
1617

1718
createChild (url, filename) {
@@ -69,6 +70,14 @@ class Resource {
6970
return this.type;
7071
}
7172

73+
setEncoding (encoding) {
74+
this.encoding = encoding;
75+
}
76+
77+
getEncoding () {
78+
return this.encoding;
79+
}
80+
7281
isHtml () {
7382
return this.getType() === types.html;
7483
}

lib/scraper.js

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,7 @@ class Scraper {
170170
self.requestedResourcePromises.set(responseData.url, requestPromise);
171171
}
172172

173+
resource.setEncoding(responseData.encoding);
173174
resource.setType(getTypeByMime(responseData.mimeType));
174175

175176
const { filename } = await self.runActions('generateFilename', { resource, responseData });
Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,12 @@
11
import '../../utils/assertions.js';
22
import nock from 'nock';
3-
import fs from 'fs-extra';
3+
import fs from 'fs/promises';
44
import scrape from 'website-scraper';
55

66
const testDirname = './test/functional/encoding/.tmp';
77
const mockDirname = './test/functional/encoding/mocks';
88

9-
// TODO: enable test when encoding issue is fixed
10-
xdescribe('Functional: Korean characters are properly encoded/decoded', function() {
9+
describe('Functional: UTF8 characters are properly encoded/decoded', () => {
1110
const options = {
1211
urls: [
1312
'http://example.com/',
@@ -16,27 +15,28 @@ xdescribe('Functional: Korean characters are properly encoded/decoded', function
1615
ignoreErrors: false
1716
};
1817

19-
beforeEach(function() {
18+
beforeEach(() => {
2019
nock.cleanAll();
2120
nock.disableNetConnect();
2221
});
2322

24-
afterEach(function() {
23+
afterEach(async () => {
2524
nock.cleanAll();
2625
nock.enableNetConnect();
27-
fs.removeSync(testDirname);
26+
await fs.rm(testDirname, { recursive: true, force: true });
2827
});
2928

3029
beforeEach(() => {
31-
nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html', {'content-type': 'text/html'});
30+
nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html', {'content-type': 'text/html; charset=utf-8'});
3231
});
3332

34-
it('should save the page in the same data as it was originally', () => {
35-
return scrape(options).then(function(result) {
36-
const scrapedIndex = fs.readFileSync(testDirname + '/index.html').toString();
37-
scrapedIndex.should.be.containEql('<div id="special-characters-korean">저는 7년 동안 한국에서 살았어요.</div>');
38-
scrapedIndex.should.be.containEql('<div id="special-characters-ukrainian">Слава Україні!</div>');
39-
scrapedIndex.should.be.containEql('<div id="special-characters-chinese">加入网站</div>');
40-
});
33+
it('should save the page in the same data as it was originally', async () => {
34+
await scrape(options);
35+
36+
const scrapedIndex = await fs.readFile(testDirname + '/index.html', { encoding: 'utf8' });
37+
scrapedIndex.should.be.containEql('<div id="special-characters-korean">저는 7년 동안 한국에서 살았어요.</div>');
38+
scrapedIndex.should.be.containEql('<div id="special-characters-ukrainian">Слава Україні!</div>');
39+
scrapedIndex.should.be.containEql('<div id="special-characters-chinese">加入网站</div>');
40+
scrapedIndex.should.be.containEql('<div id="special-characters-ukrainian">Обладнання та ПЗ</div>');
4141
});
4242
});

test/functional/encoding/mocks/index.html

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,5 +8,6 @@
88
<div id="special-characters-korean">저는 7년 동안 한국에서 살았어요.</div>
99
<div id="special-characters-ukrainian">Слава Україні!</div>
1010
<div id="special-characters-chinese">加入网站</div>
11+
<div id="special-characters-ukrainian">Обладнання та ПЗ</div>
1112
</body>
1213
</html>

0 commit comments

Comments
 (0)