Skip to content

Commit 42992fb

Browse files
authored
Merge pull request #60 from GMOD/htsget
Htsget data fetching
2 parents bb7cd26 + d0b943e commit 42992fb

File tree

16 files changed

+331
-87
lines changed

16 files changed

+331
-87
lines changed

.eslintrc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
"curly": "error",
1616
"@typescript-eslint/no-explicit-any": 0,
1717
"@typescript-eslint/explicit-function-return-type": 0,
18+
"@typescript-eslint/ban-ts-ignore": 0,
1819
"semi": [
1920
"error",
2021
"never"

package.json

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,15 +41,15 @@
4141
"dependencies": {
4242
"@babel/runtime-corejs3": "^7.5.5",
4343
"@gmod/bgzf-filehandle": "^1.3.3",
44-
"@types/long": "^4.0.0",
45-
"@types/node": "^12.7.8",
4644
"abortable-promise-cache": "^1.2.0",
45+
"buffer-crc32": "^0.2.13",
4746
"cross-fetch": "^3.0.2",
4847
"es6-promisify": "^6.0.1",
4948
"generic-filehandle": "^2.0.0",
5049
"long": "^4.0.0",
5150
"object.entries-ponyfill": "^1.0.1",
52-
"quick-lru": "^2.0.0"
51+
"quick-lru": "^2.0.0",
52+
"range-parser": "^1.2.1"
5353
},
5454
"devDependencies": {
5555
"@babel/cli": "^7.2.3",
@@ -58,6 +58,10 @@
5858
"@babel/plugin-transform-runtime": "^7.2.0",
5959
"@babel/preset-env": "^7.3.1",
6060
"@babel/preset-typescript": "^7.6.0",
61+
"@types/buffer-crc32": "^0.2.0",
62+
"@types/long": "^4.0.0",
63+
"@types/node": "^12.7.8",
64+
"@types/range-parser": "^1.2.3",
6165
"@typescript-eslint/eslint-plugin": "^2.3.1",
6266
"@typescript-eslint/parser": "^2.3.1",
6367
"babel-jest": "^24.1.0",

src/bai.ts

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ import VirtualOffset, { fromBytes } from './virtualOffset'
33
import Chunk from './chunk'
44

55
import IndexFile from './indexFile'
6-
import { longToNumber, abortBreakPoint, canMergeBlocks } from './util'
6+
import { longToNumber, abortBreakPoint, canMergeBlocks, BaseOpts } from './util'
77

88
const BAI_MAGIC = 21578050 // BAI\1
99

@@ -22,8 +22,9 @@ export default class BAI extends IndexFile {
2222
return { lineCount }
2323
}
2424

25-
async lineCount(refId: number) {
26-
const index = (await this.parse()).indices[refId]
25+
async lineCount(refId: number, opts: BaseOpts = {}) {
26+
const prom = await this.parse(opts)
27+
const index = prom.indices[refId]
2728
if (!index) {
2829
return -1
2930
}
@@ -32,11 +33,9 @@ export default class BAI extends IndexFile {
3233
}
3334

3435
// fetch and parse the index
35-
async _parse(abortSignal?: AbortSignal) {
36+
async _parse(opts: BaseOpts = {}) {
3637
const data: { [key: string]: any } = { bai: true, maxBlockSize: 1 << 16 }
37-
const bytes = (await this.filehandle.readFile({
38-
signal: abortSignal,
39-
})) as Buffer
38+
const bytes = (await this.filehandle.readFile(opts)) as Buffer
4039

4140
// check BAI magic numbers
4241
if (bytes.readUInt32LE(0) !== BAI_MAGIC) {
@@ -51,7 +50,7 @@ export default class BAI extends IndexFile {
5150
data.indices = new Array(data.refCount)
5251
let currOffset = 8
5352
for (let i = 0; i < data.refCount; i += 1) {
54-
await abortBreakPoint(abortSignal)
53+
await abortBreakPoint(opts.signal)
5554

5655
// the binning index
5756
const binCount = bytes.readInt32LE(currOffset)
@@ -105,10 +104,11 @@ export default class BAI extends IndexFile {
105104
seqId: number,
106105
start?: number,
107106
end?: number,
107+
opts: BaseOpts = {},
108108
): Promise<{ start: number; end: number; score: number }[]> {
109109
const v = 16384
110110
const range = start !== undefined
111-
const indexData = await this.parse()
111+
const indexData = await this.parse(opts)
112112
const seqIdx = indexData.indices[seqId]
113113
if (!seqIdx) {
114114
return []
@@ -168,12 +168,12 @@ export default class BAI extends IndexFile {
168168
return list
169169
}
170170

171-
async blocksForRange(refId: number, min: number, max: number) {
171+
async blocksForRange(refId: number, min: number, max: number, opts: BaseOpts = {}) {
172172
if (min < 0) {
173173
min = 0
174174
}
175175

176-
const indexData = await this.parse()
176+
const indexData = await this.parse(opts)
177177
if (!indexData) {
178178
return []
179179
}

src/bamFile.ts

Lines changed: 37 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ import AbortablePromiseCache from 'abortable-promise-cache'
22
import BAI from './bai'
33
import CSI from './csi'
44
import Chunk from './chunk'
5+
import crc32 from 'buffer-crc32'
56

67
import { unzip, unzipChunkSlice } from '@gmod/bgzf-filehandle'
78

@@ -11,30 +12,22 @@ import { LocalFile, RemoteFile, GenericFilehandle } from 'generic-filehandle'
1112
import BAMFeature from './record'
1213
import IndexFile from './indexFile'
1314
import { parseHeaderText } from './sam'
14-
import { abortBreakPoint, checkAbortSignal, timeout } from './util'
15+
import { abortBreakPoint, checkAbortSignal, timeout, makeOpts, BamOpts, BaseOpts } from './util'
1516

16-
const BAM_MAGIC = 21840194
17+
export const BAM_MAGIC = 21840194
1718

1819
const blockLen = 1 << 16
19-
type G = GenericFilehandle
20-
21-
interface BamOpts {
22-
viewAsPairs?: boolean
23-
pairAcrossChr?: boolean
24-
maxInsertSize?: number
25-
signal?: AbortSignal
26-
}
2720

2821
export default class BamFile {
2922
private renameRefSeq: (a: string) => string
3023
private bam: GenericFilehandle
3124
private index: IndexFile
32-
private featureCache: any
3325
private chunkSizeLimit: number
3426
private fetchSizeLimit: number
3527
private header: any
36-
private chrToIndex: any
37-
private indexToChr: any
28+
protected featureCache: any
29+
protected chrToIndex: any
30+
protected indexToChr: any
3831

3932
/**
4033
* @param {object} args
@@ -112,14 +105,14 @@ export default class BamFile {
112105
this.chunkSizeLimit = chunkSizeLimit || 300000000 // 300MB
113106
}
114107

115-
async getHeader(abortSignal?: AbortSignal) {
116-
const indexData = await this.index.parse(abortSignal)
108+
async getHeader(origOpts: AbortSignal | BaseOpts = {}) {
109+
const opts = makeOpts(origOpts)
110+
const indexData = await this.index.parse(opts)
117111
const ret = indexData.firstDataLine ? indexData.firstDataLine.blockPosition + 65535 : undefined
118112
let buffer
119113
if (ret) {
120-
const res = await this.bam.read(Buffer.alloc(ret + blockLen), 0, ret + blockLen, 0, {
121-
signal: abortSignal,
122-
})
114+
const res = await this.bam.read(Buffer.alloc(ret + blockLen), 0, ret + blockLen, 0, opts)
115+
123116
const { bytesRead } = res
124117
;({ buffer } = res)
125118
if (!bytesRead) {
@@ -131,7 +124,7 @@ export default class BamFile {
131124
buffer = buffer.slice(0, ret)
132125
}
133126
} else {
134-
buffer = (await this.bam.readFile({ signal: abortSignal })) as Buffer
127+
buffer = (await this.bam.readFile(opts)) as Buffer
135128
}
136129

137130
const uncba = await unzip(buffer)
@@ -142,7 +135,7 @@ export default class BamFile {
142135
const headLen = uncba.readInt32LE(4)
143136

144137
this.header = uncba.toString('utf8', 8, 8 + headLen)
145-
const { chrToIndex, indexToChr } = await this._readRefSeqs(headLen + 8, 65535, abortSignal)
138+
const { chrToIndex, indexToChr } = await this._readRefSeqs(headLen + 8, 65535, opts)
146139
this.chrToIndex = chrToIndex
147140
this.indexToChr = indexToChr
148141

@@ -154,17 +147,15 @@ export default class BamFile {
154147
async _readRefSeqs(
155148
start: number,
156149
refSeqBytes: number,
157-
abortSignal?: AbortSignal,
150+
opts: BaseOpts = {},
158151
): Promise<{
159152
chrToIndex: { [key: string]: number }
160153
indexToChr: { refName: string; length: number }[]
161154
}> {
162155
if (start > refSeqBytes) {
163-
return this._readRefSeqs(start, refSeqBytes * 2)
156+
return this._readRefSeqs(start, refSeqBytes * 2, opts)
164157
}
165-
const res = await this.bam.read(Buffer.alloc(refSeqBytes + blockLen), 0, refSeqBytes, 0, {
166-
signal: abortSignal,
167-
})
158+
const res = await this.bam.read(Buffer.alloc(refSeqBytes + blockLen), 0, refSeqBytes, 0, opts)
168159
const { bytesRead } = res
169160
let { buffer } = res
170161
if (!bytesRead) {
@@ -181,7 +172,7 @@ export default class BamFile {
181172
const chrToIndex: { [key: string]: number } = {}
182173
const indexToChr: { refName: string; length: number }[] = []
183174
for (let i = 0; i < nRef; i += 1) {
184-
await abortBreakPoint(abortSignal)
175+
await abortBreakPoint(opts.signal)
185176
const lName = uncba.readInt32LE(p)
186177
let refName = uncba.toString('utf8', p + 4, p + 4 + lName - 1)
187178
refName = this.renameRefSeq(refName)
@@ -193,7 +184,7 @@ export default class BamFile {
193184
p = p + 8 + lName
194185
if (p > uncba.length) {
195186
console.warn(`BAM header is very big. Re-fetching ${refSeqBytes} bytes.`)
196-
return this._readRefSeqs(start, refSeqBytes * 2)
187+
return this._readRefSeqs(start, refSeqBytes * 2, opts)
197188
}
198189
}
199190
return { chrToIndex, indexToChr }
@@ -269,7 +260,7 @@ export default class BamFile {
269260
const c = chunks[i]
270261
const { data, cpositions, dpositions, chunk } = await this.featureCache.get(
271262
c.toString(),
272-
c,
263+
{ chunk: c, opts },
273264
opts.signal,
274265
)
275266
const promise = this.readBamFeatures(data, cpositions, dpositions, chunk).then(records => {
@@ -376,7 +367,7 @@ export default class BamFile {
376367
const mateFeatPromises = mateChunks.map(async c => {
377368
const { data, cpositions, dpositions, chunk } = await this.featureCache.get(
378369
c.toString(),
379-
c,
370+
{ chunk: c, opts },
380371
opts.signal,
381372
)
382373
const feats = await this.readBamFeatures(data, cpositions, dpositions, chunk)
@@ -398,11 +389,10 @@ export default class BamFile {
398389
return featuresRet
399390
}
400391

401-
async _readChunk(chunk: Chunk, abortSignal?: AbortSignal) {
402-
const bufsize = chunk.fetchedSize()
403-
const res = await this.bam.read(Buffer.alloc(bufsize), 0, bufsize, chunk.minv.blockPosition, {
404-
signal: abortSignal,
405-
})
392+
async _readChunk({ chunk, opts }: { chunk: unknown; opts: BaseOpts }, abortSignal?: AbortSignal) {
393+
const c = chunk as Chunk
394+
const bufsize = c.fetchedSize()
395+
const res = await this.bam.read(Buffer.alloc(bufsize), 0, bufsize, c.minv.blockPosition, opts)
406396
const { bytesRead } = res
407397
let { buffer } = res
408398
checkAbortSignal(abortSignal)
@@ -431,8 +421,10 @@ export default class BamFile {
431421
const blockEnd = blockStart + 4 + blockSize - 1
432422

433423
// increment position to the current decompressed status
434-
while (blockStart + chunk.minv.dataPosition >= dpositions[pos++]) {}
435-
pos--
424+
if (dpositions) {
425+
while (blockStart + chunk.minv.dataPosition >= dpositions[pos++]) {}
426+
pos--
427+
}
436428

437429
// only try to read the feature if we have all the bytes for it
438430
if (blockEnd < ba.length) {
@@ -442,6 +434,9 @@ export default class BamFile {
442434
start: blockStart,
443435
end: blockEnd,
444436
},
437+
// the below results in an automatically calculated file-offset based ID
438+
// if the info for that is available, otherwise crc32 of the features
439+
//
445440
// cpositions[pos] refers to actual file offset of a bgzip block boundaries
446441
//
447442
// we multiply by (1 <<8) in order to make sure each block has a "unique"
@@ -455,11 +450,12 @@ export default class BamFile {
455450
// starts at 0 instead of chunk.minv.dataPosition
456451
//
457452
// the +1 is just to avoid any possible uniqueId 0 but this does not realistically happen
458-
fileOffset:
459-
cpositions[pos] * (1 << 8) +
460-
(blockStart - dpositions[pos]) +
461-
chunk.minv.dataPosition +
462-
1,
453+
fileOffset: cpositions
454+
? cpositions[pos] * (1 << 8) +
455+
(blockStart - dpositions[pos]) +
456+
chunk.minv.dataPosition +
457+
1
458+
: crc32.signed(ba.slice(blockStart, blockEnd)),
463459
})
464460

465461
sink.push(feature)

src/csi.ts

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -99,9 +99,10 @@ export default class CSI extends IndexFile {
9999
}
100100

101101
// fetch and parse the index
102-
async _parse(abortSignal?: AbortSignal) {
102+
async _parse(opts: { signal?: AbortSignal }) {
103103
const data: { [key: string]: any } = { csi: true, maxBlockSize: 1 << 16 }
104-
const bytes = await unzip((await this.filehandle.readFile({ signal: abortSignal })) as Buffer)
104+
const buffer = (await this.filehandle.readFile(opts)) as Buffer
105+
const bytes = await unzip(buffer)
105106

106107
// check TBI magic numbers
107108
if (bytes.readUInt32LE(0) === CSI1_MAGIC) {
@@ -126,7 +127,7 @@ export default class CSI extends IndexFile {
126127
data.indices = new Array(data.refCount)
127128
let currOffset = 16 + auxLength + 4
128129
for (let i = 0; i < data.refCount; i += 1) {
129-
await abortBreakPoint(abortSignal)
130+
await abortBreakPoint(opts.signal)
130131
// the binning index
131132
const binCount = bytes.readInt32LE(currOffset)
132133
currOffset += 4
@@ -184,7 +185,7 @@ export default class CSI extends IndexFile {
184185
beg = 0
185186
}
186187

187-
const indexData = await this.parse(opts.signal)
188+
const indexData = await this.parse(opts)
188189
if (!indexData) {
189190
return []
190191
}

0 commit comments

Comments (0)